"""Build GO annotation map files from a GAF annotation file and go.obo.

Outputs
-------
GO-map.txt
    Tab-separated, one row per gene symbol. The second column holds the
    space-separated labels (``GO:nnnnnnn-term_name``) of every GO term the
    gene is annotated with, plus all ``is_a`` / ``part_of`` ancestors of
    those terms.
GO-mapnames.txt
    Tab-separated, one row per GO term found in go.obo (obsolete terms
    included): the term label and its QuickGO URL.
"""

import re
from typing import Dict, Iterable, List, NamedTuple, Optional

import pandas as pd

GAF_PATH = 'goa_human.gaf.gz'
OBO_PATH = 'go.obo'
GO_MAP_PATH = 'GO-map.txt'
GO_MAPNAMES_PATH = 'GO-mapnames.txt'

# A stanza header is a bracketed word alone on its line ([Term], [Typedef]...).
_STANZA_HEADER_RE = re.compile(r'^\[(\w+)\][ \t]*$', re.MULTILINE)
_ID_RE = re.compile(r'^id:[ \t]*(GO:\d+)', re.MULTILINE)
_NAME_RE = re.compile(r'^name:[ \t]*(.+)', re.MULTILINE)
_OBSOLETE_RE = re.compile(r'^is_obsolete:[ \t]*true', re.MULTILINE)
# Parents are written as "is_a: GO:..." or "relationship: part_of GO:...".
# The bare "part_of: GO:..." form the script used to look for is still
# accepted so nothing that previously matched is lost.
_PARENT_RE = re.compile(
    r'^(?:(?:is_a|part_of):[ \t]*|relationship:[ \t]*part_of[ \t]+)(GO:\d+)',
    re.MULTILINE,
)

# Module-level lookups filled in by main(); get_all_ancestors() falls back to
# parents_dict when it is not given an explicit mapping.
parents_dict: Dict[str, List[str]] = {}
name_dict: Dict[str, str] = {}


class GoTerm(NamedTuple):
    """One [Term] stanza of an OBO file, reduced to what this script needs."""

    go_id: str
    name: str  # spaces replaced by underscores
    parents: List[str]  # direct is_a / part_of parents
    is_obsolete: bool


def read_gene_annotations(path: str = GAF_PATH) -> pd.DataFrame:
    """Load UniProtKB rows of a GAF file as a ``Gene`` / ``GO_term`` frame.

    Only columns 0 (database), 2 (gene symbol) and 4 (GO id) are parsed.
    Lines starting with ``!`` are GAF header comments and are skipped.
    """
    raw = pd.read_csv(
        path,
        comment='!',
        sep='\t',
        header=None,
        usecols=[0, 2, 4],
        dtype=str,
    )
    uniprot_rows = raw.loc[raw[0] == 'UniProtKB', [2, 4]]
    # rename() returns a new frame, so no assignment on a filtered slice.
    return uniprot_rows.rename(columns={2: 'Gene', 4: 'GO_term'})


def _iter_term_stanzas(content: str) -> Iterable[str]:
    """Yield the body of every [Term] stanza in *content*.

    A stanza ends at the next stanza header of any kind, so a trailing
    [Typedef] stanza is never folded into the last term.
    """
    headers = list(_STANZA_HEADER_RE.finditer(content))
    ends = [header.start() for header in headers[1:]] + [len(content)]
    for header, end in zip(headers, ends):
        if header.group(1) == 'Term':
            yield content[header.end():end]


def parse_go_terms(content: str, skip_obsolete: bool = True) -> List[GoTerm]:
    """Parse OBO text into a list of :class:`GoTerm`, in file order.

    Stanzas without a GO id or a name are ignored. Obsolete terms are
    dropped unless *skip_obsolete* is false.
    """
    terms = []
    for stanza in _iter_term_stanzas(content):
        is_obsolete = _OBSOLETE_RE.search(stanza) is not None
        if is_obsolete and skip_obsolete:
            continue
        id_match = _ID_RE.search(stanza)
        name_match = _NAME_RE.search(stanza)
        if not (id_match and name_match):
            continue
        terms.append(
            GoTerm(
                go_id=id_match.group(1),
                # Labels are space-separated in GO-map.txt, so the name
                # itself must not contain spaces.
                name=name_match.group(1).replace(' ', '_'),
                parents=_PARENT_RE.findall(stanza),
                is_obsolete=is_obsolete,
            )
        )
    return terms


def get_all_ancestors(
    term: str, parents: Optional[Dict[str, List[str]]] = None
) -> List[str]:
    """Return every ancestor of *term*, sorted, without duplicates.

    *parents* maps a term id to its direct parents; when omitted the
    module-level ``parents_dict`` is used. The walk is iterative and keeps a
    visited set, so cycles in the mapping terminate. The term itself is only
    part of the result if it is reachable from its own parents.
    """
    if parents is None:
        parents = parents_dict
    seen = set()
    pending = [term]
    while pending:
        current = pending.pop()
        for parent in parents.get(current, ()):
            if parent not in seen:
                seen.add(parent)
                pending.append(parent)
    # Sorting makes the output independent of string hash randomisation.
    return sorted(seen)


def expand_go_parents(terms: Iterable[GoTerm]) -> pd.DataFrame:
    """Map every term to itself and to all of its ancestors.

    Returns a frame with columns ``id`` (the term), ``gos`` (the term itself
    or one of its ancestors) and ``gos_label`` (``<gos>-<name>``; the bare id
    is used as name when the ancestor is not among *terms*).
    """
    terms = list(terms)
    parents = {term.go_id: term.parents for term in terms}
    names = {term.go_id: term.name for term in terms}
    rows = []
    for term in terms:
        # The term itself comes first so annotations map to their own label.
        for go_id in [term.go_id] + get_all_ancestors(term.go_id, parents):
            label = f"{go_id}-{names.get(go_id, go_id)}"
            rows.append((term.go_id, go_id, label))
    expanded = pd.DataFrame(rows, columns=['id', 'gos', 'gos_label'])
    return expanded.drop_duplicates(ignore_index=True)


def build_gene_go_map(
    go_map: pd.DataFrame, expanded_go_parents: pd.DataFrame
) -> pd.DataFrame:
    """Join gene annotations with the expanded hierarchy, one row per gene.

    Returns a frame with columns ``Gene`` and ``gos_label``, the latter being
    the gene's unique labels joined by single spaces. Annotations to terms
    missing from *expanded_go_parents* and rows with missing values are
    dropped.
    """
    merged = pd.merge(
        go_map,
        expanded_go_parents,
        left_on='GO_term',
        right_on='id',
        how='inner',
    ).dropna()
    pairs = merged[['Gene', 'gos_label']].drop_duplicates()
    return pairs.groupby('Gene')['gos_label'].agg(' '.join).reset_index()


def go_mapname_lines(terms: Iterable[GoTerm]) -> List[str]:
    """Return the GO-mapnames.txt lines (label, tab, QuickGO URL) for *terms*."""
    return [
        f"{term.go_id}-{term.name}\t"
        f"https://www.ebi.ac.uk/QuickGO/term/{term.go_id}\n"
        for term in terms
    ]


def main() -> None:
    """Read the GAF and OBO inputs and write GO-map.txt and GO-mapnames.txt."""
    go_map = read_gene_annotations(GAF_PATH)

    # go.obo is read and parsed once; both outputs are derived from it.
    with open(OBO_PATH, 'r', encoding='utf-8') as handle:
        all_terms = parse_go_terms(handle.read(), skip_obsolete=False)
    live_terms = [term for term in all_terms if not term.is_obsolete]

    parents_dict.clear()
    parents_dict.update({term.go_id: term.parents for term in live_terms})
    name_dict.clear()
    name_dict.update({term.go_id: term.name for term in live_terms})

    # The hierarchy only covers non-obsolete terms.
    expanded_go_parents = expand_go_parents(live_terms)
    gene_go_map = build_gene_go_map(go_map, expanded_go_parents)
    gene_go_map.to_csv(
        GO_MAP_PATH, sep='\t', index=False, header=['#geneNS', 'sym']
    )

    # The names file lists every term, obsolete ones included.
    with open(GO_MAPNAMES_PATH, 'w', encoding='utf-8') as handle:
        handle.writelines(go_mapname_lines(all_terms))


if __name__ == "__main__":
    main()