import pandas as pd
import re

###### READ GENE - GO ANNOTATIONS
# Build go_map: one (Gene, GO_term) row per annotation line whose first
# column is 'UniProtKB'.
file_path = 'goa_human.gaf.gz'
# Only columns 0, 2 and 4 are used below, so parse just those instead of
# every column of the file. With header=None the selected columns keep their
# original integer labels (0, 2, 4).
df = pd.read_csv(file_path, comment='!', sep='\t', header=None, usecols=[0, 2, 4])
df_filtered = df[df[0] == 'UniProtKB']
# Extract the gene names (column 2) and GO terms (column 4).
# Copy first so that renaming the columns acts on an independent frame.
# NOTE(review): rows are not filtered on the qualifier column (3); confirm
# whether negated ("NOT") annotations should be excluded here.
go_map = df_filtered[[2, 4]].copy()
go_map.columns = ['Gene', 'GO_term']


###### INFER GO HIERARCHY
# Parse go.obo into:
#   name_dict  - GO id -> term name (spaces replaced by underscores)
#   go_parents - one row per (term id, direct parent id); a term without any
#                parent gets a single row whose 'gos' is the empty string
# Obsolete terms are skipped.
file_path = 'go.obo'

# A [Term] stanza ends at the next stanza header of any kind ([Term],
# [Typedef], ...) or at end of file, so the last term does not swallow
# whatever stanzas follow it.
_TERM_STANZA_RE = re.compile(
    r'^\[Term\][ \t]*$(.*?)(?=^\[\w+\][ \t]*$|\Z)', re.MULTILINE | re.DOTALL
)
_OBSOLETE_RE = re.compile(r'^is_obsolete:\s*true', re.MULTILINE)
_ID_RE = re.compile(r'^id:\s*(GO:\d+)', re.MULTILINE)
_NAME_RE = re.compile(r'^name:\s*(.+)', re.MULTILINE)
# OBO writes is_a as its own tag, but part_of as
# "relationship: part_of GO:...". Anchoring at line start also keeps
# "intersection_of: part_of ..." lines from being counted as parents.
_PARENT_RE = re.compile(
    r'^(?:is_a:|relationship:[ \t]*part_of)[ \t]*(GO:\d+)', re.MULTILINE
)

# Lists to store extracted ids and their '|'-joined direct parents
ids = []
gos = []
name_dict = {}

# Explicit encoding so parsing does not depend on the platform default
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

terms = _TERM_STANZA_RE.findall(content)
for term in terms:
    if _OBSOLETE_RE.search(term):
        continue  # Skip this term if it is obsolete

    id_match = _ID_RE.search(term)
    name_match = _NAME_RE.search(term)
    if not (id_match and name_match):
        continue

    go_id = id_match.group(1)
    ids.append(go_id)
    # Join all is_a and part_of parents with |; '' when the term has none
    gos.append('|'.join(_PARENT_RE.findall(term)))
    name_dict[go_id] = name_match.group(1).replace(' ', '_')

# One row per (term, direct parent). Splitting '' yields [''], so parentless
# terms keep a single row with an empty-string parent.
df = pd.DataFrame({'id': ids, 'gos': gos})
df['gos'] = df['gos'].str.split('|')
go_parents = df.explode('gos', ignore_index=True)


###### go_parents holds each GO term together with its direct parents;
### the code below expands that to all ancestors of every GO term.
# Index the direct parents of each term: term id -> list of its 'gos' values,
# in the order the rows appear in go_parents
parents_dict = {}
for child_id, parent_id in zip(go_parents['id'], go_parents['gos']):
    parents_dict.setdefault(child_id, []).append(parent_id)

# Function to find all ancestors for a given term (iterative graph walk)
def get_all_ancestors(term):
    """Return every ancestor of ``term`` reachable through ``parents_dict``.

    Starting from ``term``, the parents listed in the module-level
    ``parents_dict`` are followed transitively using an explicit stack.
    Every value stored in ``parents_dict`` is reported as-is, including any
    empty-string entry.

    Args:
        term: Key to start from. A key missing from ``parents_dict`` simply
            has no parents.

    Returns:
        A list of unique ancestors in the order they were first discovered.
        The order depends only on the contents of ``parents_dict``, so
        repeated runs on the same input give the same list.
    """
    # A dict is used as an insertion-ordered set: membership tests stay O(1)
    # and the result order does not depend on string hashing.
    ancestors = {}
    pending = [term]

    while pending:
        current_term = pending.pop()
        for parent in parents_dict.get(current_term, ()):
            # Already-seen parents are not revisited, which also guarantees
            # termination if the relationships contain a cycle.
            if parent not in ancestors:
                ancestors[parent] = None
                pending.append(parent)

    return list(ancestors)

# Build the (term, ancestor) rows. go_parents has one row per
# (term, direct parent) pair, so each term id is expanded once here rather
# than once per row; unique() keeps the ids in order of first appearance.
expanded_rows = [
    {'id': term_id, 'gos': ancestor}
    for term_id in go_parents['id'].unique()
    for ancestor in get_all_ancestors(term_id)
]

# Convert the expanded rows into a DataFrame; naming the columns explicitly
# keeps the frame well-formed even when there are no rows at all
expanded_go_parents = pd.DataFrame(expanded_rows, columns=['id', 'gos'])
expanded_go_parents = expanded_go_parents.drop_duplicates()

#### add self id to gos for easy merge
# Rows whose 'gos' is the empty string get the row's own id instead, so the
# term itself appears among the entries it is merged with.
expanded_go_parents['gos'] = expanded_go_parents['gos'].where(
    expanded_go_parents['gos'] != '', expanded_go_parents['id']
)

# Only change formatting of the output labels: "<GO id>-<name>", falling back
# to the id itself when the id has no entry in name_dict
expanded_go_parents['gos_label'] = expanded_go_parents['gos'].map(
    lambda go_id: f"{go_id}-{name_dict.get(go_id, go_id)}"
)


#### MERGE EXPANDED GO ANCESTRY LIST WITH GO_MAP (GENES VS GO TERMS)
# Right-join the gene annotations onto the expanded table via the annotated
# term, then discard every row that is left with a missing value.
GOmap = (
    go_map
    .merge(expanded_go_parents, how='right', left_on='GO_term', right_on='id')
    .dropna()
)
GOmap = GOmap.loc[:, ['Gene', 'gos_label']].drop_duplicates()
# One output row per gene: all of its labels joined by single spaces
GOmap_grouped = GOmap.groupby('Gene')['gos_label'].apply(' '.join).reset_index()
GOmap_grouped.to_csv('GO-map.txt', sep='\t', index=False, header=['#geneNS', 'sym'])


###### EXTRACT GO AND THEIR NAMES
# Write GO-mapnames.txt: one "<GO id>-<name>\t<QuickGO link>" line for every
# [Term] stanza in go.obo that has both an id and a name. Obsolete terms are
# not filtered out in this pass.
file_path = 'go.obo'
output_file_path = 'GO-mapnames.txt'

# Lists to store extracted labels and names (kept parallel to each other)
ids = []    # "<GO id>-<name>" labels
names = []  # term names with spaces replaced by underscores
mapname_lines = []  # fully formatted output lines

# Explicit encoding so parsing does not depend on the platform default
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# A [Term] stanza ends at the next stanza header of any kind or at end of
# file, so the last term does not swallow the stanzas that follow it.
terms = re.findall(
    r'^\[Term\][ \t]*$(.*?)(?=^\[\w+\][ \t]*$|\Z)',
    content,
    re.MULTILINE | re.DOTALL,
)

for term in terms:
    # Extract id and name from the term
    id_match = re.search(r'^id:\s*(GO:\d+)', term, re.MULTILINE)
    name_match = re.search(r'^name:\s*(.+)', term, re.MULTILINE)
    if not (id_match and name_match):
        continue

    go_id = id_match.group(1)
    name = name_match.group(1).replace(' ', '_')
    label = f"{go_id}-{name}"
    # Build the link from the parsed id directly instead of recovering the id
    # by splitting the label on '-'
    link = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"

    ids.append(label)
    names.append(name)
    mapname_lines.append(f"{label}\t{link}\n")

# Write the data to GO-mapnames.txt
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.writelines(mapname_lines)