import pandas as pd
import matplotlib.pyplot as plt

# Build a per-gene LOEUF table from the constraint-metrics file.
#
# Steps:
#   1. Load the gzip-compressed, tab-separated constraint-metrics table.
#   2. Keep only rows flagged as canonical.
#   3. Keep the gene name plus the selected metric column(s), and add a
#      `<column>_rank` column holding the rank of each value.
#   4. Rename `oe_lof_upper` -> `LOEUF` (and its rank column accordingly).
#   5. Print a preview and write the result to `LOEUF_scores.csv.gz`.

# Load the TSV file into a DataFrame.
df = pd.read_csv(
    'supplementary_dataset_11_full_constraint_metrics.tsv.gz',
    sep='\t',
    compression='gzip',
)

# Keep only rows whose `canonical` flag equals True.
df = df[df['canonical'].eq(True)]

cols_to_include = ['oe_lof_upper']

# Take an explicit copy so that adding columns below modifies an independent
# DataFrame rather than a selection of `df` (avoids SettingWithCopyWarning).
new_df = df[['gene'] + cols_to_include].copy()

# Rank each selected column using pandas' default ranking settings.
for col in cols_to_include:
    new_df[col + '_rank'] = new_df[col].rank()

# Rename the metric and its rank column to the LOEUF naming in one pass.
new_df = new_df.rename(
    columns={
        'oe_lof_upper': 'LOEUF',
        'oe_lof_upper_rank': 'LOEUF_rank',
    }
)

# Display the first rows of the new DataFrame.
print(new_df.head())

# Write without the index column; gzip compression is inferred by pandas
# from the `.gz` suffix.
new_df.to_csv('LOEUF_scores.csv.gz', index=False)

# --- Optional manual spot checks (kept disabled) ---------------------------
# oe_lof_values = new_df['LOEUF']
# max_value = oe_lof_values.max()
# # Print the maximum value
# print("Maximum value of 'oe_lof' column:", max_value)
#
# # Some genes with high intolerance by RVIS
# genenow = 'CHD8'
# genenow = 'LRP1'
# genenow = 'DYNC1H1'
# # Some genes with low intolerance by RVIS
# genenow = 'MKI67'
# genenow = 'FLG'
# gene_row = new_df[new_df['gene'] == genenow]
# # Display the row
# print(gene_row)