import pandas as pd # learn PFAM id to PFAM name mapping pfam_data = pd.read_excel('Pfam-A.clans.xlsx', header=None) pfam_names = pfam_data.iloc[:, [3, 4]] # pfam_names.to_csv('gene_domainsnames.txt') pfam_names.to_csv('domain-mapnames.txt', sep='\t', index=False, header=False) pfam_data = pfam_data.iloc[:, [0, 3]] pfam_data.columns = ['PFAMid', 'PFAMname'] pfam_mapping = pfam_data.set_index('PFAMid')['PFAMname'].to_dict() # load PFAM id - gene name table df = pd.read_excel('mart_export.xlsx') df.dropna(subset=['Pfam ID', 'Gene name'], inplace=True) # Replace Pfam IDs with Pfam domain names df["Pfam ID"] = df["Pfam ID"].map(pfam_mapping) # Group by 'Gene name' and aggregate 'Pfam ID' into a tab-separated list df_grouped = df.groupby('Gene name')['Pfam ID'].apply(lambda x: '\t'.join(str(v) for v in x if pd.notna(v))).reset_index() df_grouped.columns = ['Gene name', 'Domain names'] # Save to TSV file tsv_file = 'domain-map.txt' df_grouped.to_csv(tsv_file, sep='\t', index=False) # Read the file and replace double quotes with spaces with open('domain-map.txt', 'r') as file: content = file.read().replace('"', ' ') # Write the modified content back to a new TSV file with open('domain-map.txt', 'w') as file: file.write(content)