### Clean the Salokas data, find TF and Kinase fusions in the test set
import pandas as pd
import os


def get_gene_type(gene, d):
    """Classify a gene as 'Kinase', 'TF', or 'Other'.

    Args:
        gene: Gene symbol to look up.
        d: Mapping from gene symbol to an annotation string; the values
            'kinase' and 'tf' are the only ones recognised.

    Returns:
        'Kinase' if the gene is annotated 'kinase', 'TF' if annotated 'tf',
        and 'Other' in every remaining case (gene missing from ``d`` or
        carrying any other annotation).
    """
    annotation = d.get(gene)
    if annotation == 'kinase':
        return 'Kinase'
    if annotation == 'tf':
        return 'TF'
    # Always return a string: a None here would turn the concatenated
    # 'fusion_type' label built by the caller into a missing value.
    return 'Other'


# Load TF and Kinase Fusions
def main():
    """Annotate test-set fusions with head/tail gene types and write CSVs.

    Reads the Salokas table and the fusion databases from hard-coded
    relative paths, labels each fusion's head and tail gene via
    ``get_gene_type``, keeps only rows whose ``aa_seq`` appears in the
    test split, and writes two files under ``processed_data/``:
    ``test_seqs_tftf_kk.csv`` and
    ``domain_conservation_fusions_inputfile.csv``.
    Intermediate tables are printed to stdout along the way.
    """
    os.makedirs("processed_data", exist_ok=True)

    tf_kinase_parts = pd.read_csv("raw_data/salokas_2020_tableS3.csv")
    print(tf_kinase_parts)
    ht_tf_kinase_dict = dict(zip(tf_kinase_parts['Gene'], tf_kinase_parts['Kinase or TF']))

    ## Categorize everything in fuson_db
    fuson_db = pd.read_csv("../../../data/fuson_db.csv")
    print(fuson_db['benchmark'].value_counts())
    print(fuson_db.loc[fuson_db['benchmark'].notna()])
    print(fuson_db.columns)
    print(fuson_db)

    # This one has each row with one fusiongene name
    fuson_ht_db = pd.read_csv("../../../data/blast/fuson_ht_db.csv")
    print(fuson_ht_db.columns)
    print(fuson_ht_db)

    # Split "HEAD::TAIL" fusion names into head gene (hg) and tail gene (tg)
    fuson_ht_db[['hg', 'tg']] = fuson_ht_db['fusiongenes'].str.split("::", expand=True)
    print(fuson_ht_db.loc[fuson_ht_db['hg'] == 'PAX3'])
    print(fuson_ht_db)

    fuson_ht_db['hg_type'] = fuson_ht_db['hg'].apply(lambda x: get_gene_type(x, ht_tf_kinase_dict))
    fuson_ht_db['tg_type'] = fuson_ht_db['tg'].apply(lambda x: get_gene_type(x, ht_tf_kinase_dict))
    fuson_ht_db['fusion_type'] = fuson_ht_db['hg_type'] + '::' + fuson_ht_db['tg_type']
    fuson_ht_db['type'] = 'fusion'

    # Keep things in the test set
    test_set = pd.read_csv("../../../data/splits/test_df.csv")
    print(test_set.columns, len(test_set))
    test_seqs = test_set['sequence'].tolist()
    fuson_ht_db = fuson_ht_db.loc[
        fuson_ht_db['aa_seq'].isin(test_seqs)
    ].sort_values(by=['fusion_type']).reset_index(drop=True)
    fuson_ht_db.to_csv("processed_data/test_seqs_tftf_kk.csv", index=False)

    # Isolate a few fusions of interest and keep the longest sequence of each
    fusion_genes_of_interest = [
        "EWSR1::FLI1",
        "PAX3::FOXO1",
        "TRIM24::RET",
        "ETV6::NTRK3",
    ]
    # Sorting by length (descending) within each fusion means drop_duplicates,
    # which keeps the first occurrence, retains the longest sequence.
    df_of_interest = fuson_ht_db.loc[
        fuson_ht_db['fusiongenes'].isin(fusion_genes_of_interest)
    ].sort_values(
        by=['fusiongenes', 'length'], ascending=[True, False]
    ).reset_index(drop=True).drop_duplicates(subset='fusiongenes').reset_index(drop=True)
    #df_of_interest.to_csv("domain_conservation_fusions.csv",index=False)

    # Build the input file for the downstream step; the window spans the whole
    # sequence (residue 1 through 'length').
    # Explicit copy so the column assignments below act on an independent
    # frame rather than on a slice of df_of_interest.
    discovery_input = df_of_interest[['fusiongenes', 'length', 'aa_seq']].copy()
    discovery_input['start_residue_index'] = 1
    discovery_input['n'] = 3
    discovery_input = discovery_input.rename(columns={
        'length': 'end_residue_index',
        'aa_seq': 'full_fusion_sequence',
        'fusiongenes': 'fusion_name',
    })
    discovery_input[
        ['fusion_name', 'full_fusion_sequence', 'start_residue_index', 'end_residue_index', 'n']
    ].to_csv("processed_data/domain_conservation_fusions_inputfile.csv", index=False)
    print(discovery_input)


if __name__ == "__main__":
    main()