"""Convert a SFARI Gene CNV export into a tab-separated CNV table.

The script reads the export file, derives a ``chr``-prefixed chromosome
name and integer start/end positions for every CNV, and writes the
reshaped table to ``SFARI_gene_CNV.txt``.
"""

import pandas as pd

DEFAULT_INPUT_PATH = "SFARI-Gene_cnvs_10-09-2024release_12-11-2024export.csv"
DEFAULT_OUTPUT_PATH = "SFARI_gene_CNV.txt"


def load_cnv_export(file_path):
    """Read the CNV export, trying a comma delimiter first and then a tab.

    Args:
        file_path: Path of the delimited text file to read.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    df = pd.read_csv(file_path, delimiter=",")
    # A tab-delimited file parsed with ',' does not expose the expected
    # column names, so fall back to the tab delimiter in that case.
    if "cnv-locus" not in df.columns:
        df = pd.read_csv(file_path, delimiter="\t")
    return df


def get_chromosome(cnv_locus):
    """Return the chromosome of a CNV locus, prefixed with ``chr``.

    The chromosome is everything before the first ``p`` or ``q`` in the
    locus string; when neither letter occurs the whole string is used.

    Args:
        cnv_locus: Locus string such as ``"15q11.2-q13.1"``. A missing
            value (NaN/None) is returned unchanged instead of raising.

    Returns:
        ``"chr"`` followed by the chromosome part, or the missing value.
    """
    if pd.isna(cnv_locus):
        return cnv_locus
    arm_index = next(
        (i for i, char in enumerate(cnv_locus) if char in ("p", "q")),
        len(cnv_locus),  # no 'p' or 'q' found: keep the whole string
    )
    return "chr" + cnv_locus[:arm_index]


def build_cnv_table(df):
    """Reshape the raw export into the output column layout.

    Args:
        df: DataFrame holding the export columns ``cnv-locus``,
            ``cnv-type``, ``basepair-range``, ``deletion-values``,
            ``duplication-values``, ``animal-model``,
            ``number-of-reports``, ``number-case-population`` and
            ``number-case-individuals``.

    Returns:
        A new DataFrame with the chromosome, integer start/end positions,
        a combined CNV name and the remaining columns copied over.
    """
    # Split "<begin>-<end>" once and reuse the pieces for both positions.
    range_parts = df["basepair-range"].str.split("-")
    return pd.DataFrame({
        "chrom": df["cnv-locus"].apply(get_chromosome),
        "pos_beg": range_parts.str[0].astype(int),
        "pos_end": range_parts.str[1].astype(int),
        "CNV_name": df["cnv-locus"] + " " + df["cnv-type"],
        "region_cytoband": df["cnv-locus"],
        "deletion_duplication": df["cnv-type"],
        "deletion-values": df["deletion-values"],
        "duplication-values": df["duplication-values"],
        # Missing entries become '' and the literal 'Array' becomes 'yes'.
        "animal-model": df["animal-model"].fillna("").replace("Array", "yes"),
        "number-of-reports": df["number-of-reports"],
        "number-case-population": df["number-case-population"],
        "number-case-individuals": df["number-case-individuals"],
    })


def main(file_path=DEFAULT_INPUT_PATH, output_path=DEFAULT_OUTPUT_PATH):
    """Load the export, reshape it and save it as a tab-separated file."""
    new_df = build_cnv_table(load_cnv_export(file_path))
    new_df.to_csv(output_path, sep="\t", index=False)


if __name__ == "__main__":
    main()