| |
|
|
| import requests |
| import json |
| import pandas as pd |
| import numpy as np |
|
|
| import requests |
| import re |
| import os |
| import shutil |
|
|
| from Bio.PDB import MMCIFParser |
| import Bio.PDB as PDB |
| from Bio import pairwise2 |
| from Bio.pairwise2 import format_alignment |
| from bs4 import BeautifulSoup |
| import pdb |
|
|
| from fuson_plm.utils.logging import log_update, open_logfile |
|
|
| |
class AlphaFoldStructure:
    '''
    This class processes an mmCIF file, either uploaded or downloaded from the AlphaFold2 database, to provide comprehensive information.

    After a successful construction the following attributes are available:
        file_path: path of the mmCIF file that was parsed
        secondary_structure_types: dict mapping secondary structure type IDs to descriptions
        cif_lines: raw lines of the mmCIF file
        secondary_structures: list of (type, id, description, position) tuples
        structure_dict: dict with keys 'avg_pLDDT', 'res_pLDDTs' and 'seq'
        sequence, plddts, avg_pLDDT: the individual entries of structure_dict
        residues_df: per-residue summary dataframe
        secondary_structures_df: per-secondary-structure summary dataframe

    If no mmCIF file could be located, only file_path (None) and secondary_structure_types are set
    and an error is logged.
    '''
    def __init__(self, fold_path=None, uniprot_to_download=None, uniprot_output_dir=None, secondary_structure_types=None):
        '''
        Args:
            fold_path: path to a local structure file. A file whose extension is .pdb is first
                converted to mmCIF inside the 'mmcif_converted_files' directory.
            uniprot_to_download: UniProt ID whose AlphaFold model should be downloaded. Takes
                precedence over fold_path when both are given.
            uniprot_output_dir: directory in which to store the downloaded file (optional).
            secondary_structure_types: dict of secondary structure type IDs -> descriptions. When
                None, the dict is pulled from the wwPDB mmCIF dictionary website.
        '''
        # Always define file_path so the "no CIF file" branch below is reachable when neither
        # a fold_path nor a uniprot_to_download is supplied.
        self.file_path = None

        if fold_path is not None:
            fold_fname = os.path.basename(fold_path)
            # splitext copes with file names that contain no dot or several dots
            prefix, suffix = os.path.splitext(fold_fname)

            if suffix.lower() == '.pdb':
                conversion_path = 'mmcif_converted_files'
                os.makedirs(conversion_path, exist_ok=True)
                fold_path = self.__convert_pdb_to_mmcif__(fold_path, f'{conversion_path}/{prefix}.cif')

            self.file_path = fold_path

        if uniprot_to_download is not None:
            if fold_path is not None:
                log_update("WARNING: both a fold_path and a uniprot_to_download were provided. Running default: downloading the CIF file for provided UniProt ID.")
            # __download_mmCIF returns None when the download fails
            self.file_path = self.__download_mmCIF(uniprot_to_download, output_path=uniprot_output_dir)

        if secondary_structure_types is None:
            self.secondary_structure_types = self.__pull_secondary_structure_types()
        else:
            self.secondary_structure_types = secondary_structure_types

        if self.file_path:
            self.cif_lines = self.__parse_cif()
            self.secondary_structures = self.__extract_secondary_structures()
            self.structure_dict = self.__calc_pLDDTs()
            self.sequence = self.structure_dict['seq']
            self.plddts = self.structure_dict['res_pLDDTs']
            self.avg_pLDDT = self.structure_dict['avg_pLDDT']
            self.residues_df = self.__create_residues_summary_dataframe()
            self.secondary_structures_df = self.__create_secondary_structures_summary_dataframe()
        else:
            log_update("ERROR: structure could not be created. No CIF file found.")

    def __convert_pdb_to_mmcif__(self, pdb_filename, mmcif_filename):
        '''
        Parse the PDB file at pdb_filename and save it in mmCIF format at mmcif_filename.

        Return: mmcif_filename
        '''
        parser = PDB.PDBParser()
        structure = parser.get_structure('structure', pdb_filename)

        io = PDB.MMCIFIO()
        io.set_structure(structure)
        io.save(mmcif_filename)
        return mmcif_filename

    def __download_mmCIF(self, uniprot_id, output_path=None):
        '''
        Download mmCIF file with provided uniprot_id and optional output_path for the downloaded file.
        output_path is treated as a directory; the file keeps its AlphaFold database name
        (AF-<uniprot_id>-F1-model_v4.cif). Without an output_path the file is written to the
        current working directory.

        Return: path to downloaded file if successful, None otherwise
        '''
        full_file_name = f"AF-{uniprot_id}-F1-model_v4.cif"

        if output_path is None:
            output_path = full_file_name
        else:
            output_path = f"{output_path}/{full_file_name}"

        url = f"https://alphafold.ebi.ac.uk/files/{full_file_name}"
        response = requests.get(url)

        if response.status_code != 200:
            log_update(f"Failed to download file. Status code: {response.status_code}")
            return None

        with open(output_path, 'wb') as file:
            file.write(response.content)

        return output_path

    def __pull_secondary_structure_types(self):
        '''
        Pull a dictionary of secondary structure types and their descriptions from the PDB mmCIF website (necessary for annotating the CIF file)
        Only called if the user does not provide such a dictionary themselves.

        Only entries whose description contains '(protein)' are kept.
        Raises Exception if the page cannot be retrieved or does not have the expected layout.
        '''
        url = "https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Items/_struct_conf_type.id.html"
        response = requests.get(url)

        if response.status_code != 200:
            raise Exception("Failed to retrieve mmCIF dictionary")

        soup = BeautifulSoup(response.content, 'html.parser')

        # The type IDs live in the table that follows the 'Controlled Vocabulary' panel header
        header = soup.find('h4', class_='panel-title')
        if header is None or 'Controlled Vocabulary' not in header.text:
            raise Exception("Could not find the 'Controlled Vocabulary' header")

        table = header.find_next('table')
        if table is None:
            raise Exception("Could not find the table following the 'Controlled Vocabulary' header")

        secondary_structure_types = {}
        # rows[0] is skipped as the table header row
        for row in table.find_all('tr')[1:]:
            cols = row.find_all('td')
            if len(cols) > 1:
                type_id = cols[0].text.strip()
                description = cols[1].text.replace('\t', ' ').strip()
                # collapse runs of spaces left over from the HTML layout
                description = re.sub(' +', ' ', description)

                if '(protein)' in description:
                    secondary_structure_types[type_id] = description

        return secondary_structure_types

    def get_secondary_structure_types(self):
        '''
        Display secondary structure types (one log line per type) and return the dictionary.
        '''
        log_update("Secondary Structure Types in mmCIF files:")
        for ss_type, description in self.secondary_structure_types.items():
            log_update(f"{ss_type}: {description}")

        return self.secondary_structure_types

    def __parse_cif(self):
        '''
        Read cif file lines from self.file_path
        '''
        with open(self.file_path, 'r') as file:
            lines = file.readlines()
        return lines

    def __extract_secondary_structures(self):
        '''
        Iterate through the lines of the cif files to find each secondary structure.
        Returns a tuple for each amino acid that has a secondary structure annotation. Tuple contains:
            1. Structure Type (e.g. STRN)
            2. Structure ID (e.g. STRN1)
            3. Description (e.g. beta strand)
            4. Position (e.g. 3)
        '''
        # Column indices of the whitespace-separated _struct_conf data rows.
        # NOTE(review): this layout is assumed to match the files written by the AlphaFold
        # database; it is not derived from the loop header of the file being read.
        start_col, type_col, end_col, id_col = 2, 6, 9, 13
        min_columns = id_col + 1

        secondary_structures = []
        parsing_secondary_structure = False

        for line in self.cif_lines:
            if line.startswith("_struct_conf.conf_type_id"):
                parsing_secondary_structure = True
                continue

            if not parsing_secondary_structure:
                continue

            # '#' closes the _struct_conf category
            if line.startswith("#"):
                parsing_secondary_structure = False
                continue

            columns = line.split()
            # Remaining header lines (one token each) and truncated rows are skipped; every
            # index used below must exist, hence the check against the highest index.
            if len(columns) < min_columns:
                continue

            sec_struc_type = columns[type_col]
            sec_struc_id = columns[id_col]
            start_res = int(columns[start_col])
            end_res = int(columns[end_col])
            sec_struc_name = self.secondary_structure_types.get(sec_struc_type, 'Unknown')

            for pos in range(start_res, end_res + 1):
                secondary_structures.append((sec_struc_type, sec_struc_id, sec_struc_name, pos))

        return secondary_structures

    def __calc_pLDDTs(self):
        '''
        This method iterates through the cif file to return a dictionary with a few key pieces of info:
            1. Sequence ('seq'): one-letter sequence of the last chain of the first model
            2. pLDDTs for each residue ('res_pLDDTs'): B-factor of the first atom of every residue, over all chains
            3. Average pLDDT ('avg_pLDDT'): mean of res_pLDDTs rounded to 2 decimal places

        If a residue name is not one of the 20 standard amino acids, processing of that chain stops:
        the sequence is cut at that residue and the remaining pLDDTs of the chain stay 0.
        '''
        aa_dict = {
            "ALA": "A", "CYS": "C", "ASP": "D", "GLU": "E", "PHE": "F",
            "GLY": "G", "HIS": "H", "ILE": "I", "LYS": "K", "LEU": "L",
            "MET": "M", "ASN": "N", "PRO": "P", "GLN": "Q", "ARG": "R",
            "SER": "S", "THR": "T", "VAL": "V", "TRP": "W", "TYR": "Y"
        }

        parser = MMCIFParser(QUIET=True)
        data = parser.get_structure("structure", self.file_path)

        # Only the first model is used
        models = list(data.get_models())
        chains = list(models[0].get_chains())

        seq = ''  # defined up front so a model without chains still returns a sequence
        all_pLDDTs = []
        for chain in chains:
            residues = list(chain.get_residues())
            seq = ''
            pLDDTs = [0] * len(residues)

            for i, residue in enumerate(residues):
                try:
                    seq += aa_dict[residue.get_resname()]
                except KeyError:
                    log_update('residue name invalid')
                    break

                # The B-factor of the first atom is taken as the residue's pLDDT
                atoms = list(residue.get_atoms())
                pLDDTs[i] = atoms[0].get_bfactor()

            all_pLDDTs.extend(pLDDTs)

        avg_pLDDT = np.mean(all_pLDDTs)
        return {
            'avg_pLDDT': round(avg_pLDDT, 2),
            'res_pLDDTs': all_pLDDTs,
            'seq': seq
        }

    def __create_residues_summary_dataframe(self):
        '''
        Create a dataframe that summarizes the secondary structure information for each residue.
        Columns:
            1. Structure Type: type of secondary structure (e.g. STRN)
            2. Structure ID: ID of this secondary structure (e.g. STRN1)
            3. Description: description of this secondary structure (e.g. beta strand)
            4. Position: amino acid position (e.g. 3)
            5. Residue: amino acid 1-letter code (e.g. A)
            6. pLDDT: alphafold2's pLDDT score for this residue (e.g. 77.54)
            7. Disordered: is this residue disordered or not? A residue is not disordered if it's in a HELX or STRN. (True/False)

        Residues without any secondary structure annotation have missing values in the three structure columns.
        '''
        df_secondary_structures = pd.DataFrame(self.secondary_structures, columns=['Structure Type', 'Structure ID', 'Description', 'Position'])

        # One row per residue; positions are numbered from 1
        df_temp = pd.DataFrame(
            data={
                'Position': list(range(1, len(self.sequence) + 1)),
                'Residue': list(self.sequence),
                'pLDDT': self.plddts
            })

        # Right merge keeps every residue, annotated or not
        df_secondary_structures = pd.merge(df_secondary_structures, df_temp, on='Position', how='right')

        # Missing structure types are not strings and therefore count as disordered
        df_secondary_structures['Disordered'] = df_secondary_structures['Structure Type'].apply(
            lambda x: not (isinstance(x, str) and ('HELX' in x or 'STRN' in x))
        )

        return df_secondary_structures

    def __create_secondary_structures_summary_dataframe(self):
        '''
        Create a dataframe grouped by each Structure ID, providing a summary of each secondary structure in the chain.
        Columns:
            1. Structure ID: ID of this secondary structure (e.g. STRN1)
            2. Start: start position of this secondary structure (e.g. 3)
            3. End: end position of this secondary structure (e.g. 12)
            4. Start Residue: amino acid 1-letter code of the start position (e.g. A)
            5. End Residue: amino acid 1-letter code of the end position (e.g. L)
            6. Disordered: is this structure disordered or not? A structure is not disordered if it's a HELX or STRN. (True/False)
            7. Description: description of this secondary structure (e.g. beta strand)
            8. Structure Type: type of secondary structure (e.g. STRN)
            9. avg_pLDDT: average pLDDT for this secondary structure to 2 decimal places (e.g. 77.54)
        '''
        secondary_structures_df = self.residues_df.groupby('Structure ID').agg({
            'Position': ['first', 'last'],
            'Residue': ['first', 'last'],
            'Disordered': 'first',
            'Description': 'first',
            'Structure Type': 'first',
            'pLDDT': 'mean'
        }).reset_index()

        # Flatten the two-level column index produced by the aggregation
        secondary_structures_df.columns = ['Structure ID', 'Start', 'End', 'Start Residue', 'End Residue', 'Disordered', 'Description', 'Structure Type', 'avg_pLDDT']
        secondary_structures_df['avg_pLDDT'] = secondary_structures_df['avg_pLDDT'].round(2)

        return secondary_structures_df

    def get_residues_df(self):
        '''
        Return the per-residue summary dataframe.
        '''
        return self.residues_df

    def get_secondary_structures_df(self):
        '''
        Return the per-secondary-structure summary dataframe.
        '''
        return self.secondary_structures_df

    def get_full_sequence(self):
        '''
        Return the sequence as a string, rebuilt from the Residue column of the residues dataframe.
        '''
        return ''.join(self.residues_df['Residue'])

    def get_average_plddt(self):
        '''
        Return the unrounded mean of the pLDDT column of the residues dataframe, or None if it has no values.
        '''
        plddt_values = [plddt for plddt in self.residues_df['pLDDT'] if plddt is not None]
        if not plddt_values:
            return None
        return sum(plddt_values) / len(plddt_values)
|
|
def pull_secondary_structure_types():
    '''
    Fetch the controlled vocabulary of secondary structure types from the wwPDB mmCIF dictionary page.

    The prettified HTML of the page is also written to 'mmcif_dictionary.txt' in the current
    working directory.

    Return: dict mapping each type ID to its description, restricted to descriptions that contain '(protein)'
    Raises Exception if the page cannot be retrieved or does not have the expected layout.
    '''
    dictionary_url = "https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Items/_struct_conf_type.id.html"
    page = requests.get(dictionary_url)
    if page.status_code != 200:
        raise Exception("Failed to retrieve mmCIF dictionary")

    parsed_page = BeautifulSoup(page.content, 'html.parser')

    # Keep a copy of the page on disk
    with open('mmcif_dictionary.txt', 'w') as dump:
        dump.write(parsed_page.prettify())

    panel_title = parsed_page.find('h4', class_='panel-title')
    if not (panel_title is not None and 'Controlled Vocabulary' in panel_title.text):
        raise Exception("Could not find the 'Controlled Vocabulary' header")

    vocabulary_table = panel_title.find_next('table')
    if vocabulary_table is None:
        raise Exception("Could not find the table following the 'Controlled Vocabulary' header")

    protein_types = {}
    # The first row is skipped as the table header
    for table_row in vocabulary_table.find_all('tr')[1:]:
        cells = table_row.find_all('td')
        if len(cells) <= 1:
            continue

        # Tabs become spaces, then runs of spaces are collapsed into one
        label = re.sub(' +', ' ', cells[1].text.replace('\t', ' ').strip())
        if '(protein)' not in label:
            continue

        protein_types[cells[0].text.strip()] = label

    return protein_types
|
|
| |
def process_fusionpdb_fusion_files(files, level_2_3_structure_info, folder, save_path=None):
    '''
    Parse every fusion structure file and record its sequence and pLDDT scores.

    Args:
        files: file names (not full paths) of the structure files inside folder
        level_2_3_structure_info: dataframe with a 'Structure Link' column whose entries contain
            "/<file name>". It is modified in place: the columns 'Fold AA seq', 'Avg pLDDT' and
            'pLDDTs' are (re)initialized and then filled in for every processed structure.
        folder: directory that contains the structure files
        save_path: CSV file to which the rows of each processed structure are appended. If the
            file already exists, structures listed in it are skipped, so an interrupted run can
            be resumed. If None, nothing is read from or written to disk.

    Return:
        With a save_path: the contents of the CSV file read back from disk (an empty frame if
        nothing has ever been written). Without a save_path: the rows processed in this call.
    '''
    secondary_structure_types = pull_secondary_structure_types()

    n_rows = len(level_2_3_structure_info)
    level_2_3_structure_info['Fold AA seq'] = [''] * n_rows
    # Float placeholder so that the float averages assigned below match the column dtype
    level_2_3_structure_info['Avg pLDDT'] = [0.0] * n_rows
    level_2_3_structure_info['pLDDTs'] = [''] * n_rows

    saving = save_path is not None

    # File names that a previous run already wrote to save_path
    pre_loop_processed = []
    if saving and os.path.exists(save_path):
        pre_loop_processed = pd.read_csv(save_path)['Structure Link'].tolist()
        pre_loop_processed = [x.split('/')[-1] for x in pre_loop_processed]
        log_update(f"Total structures already processed: {len(pre_loop_processed)}")
    already_processed = set(pre_loop_processed)

    log_update("\nProcessing fusion structures...")

    in_memory_rows = []
    for i, structure in enumerate(files):
        log_update(f'\tProcessing #{i+1}: {structure}')

        if structure in already_processed:
            log_update(f"\t\tAlready processed. Continuing...")
            continue

        obj = AlphaFoldStructure(fold_path=f'{folder}/{structure}', secondary_structure_types=secondary_structure_types)
        aa_seq = obj.get_full_sequence()
        avg_plddt = obj.get_average_plddt()
        residues_df = obj.get_residues_df()
        all_plddts = ",".join(residues_df['pLDDT'].astype(str).tolist())

        log_update(f"\t\tAvg pLDDT: {round(avg_plddt,2)}\tFold AA seq: {aa_seq}\tFirst 5 pLDDTs: {','.join(all_plddts.split(',')[0:5])}")

        # Literal match: the file name contains '.', which must not act as a regex wildcard.
        # Rows without a link never match.
        link_mask = level_2_3_structure_info['Structure Link'].str.contains(f"/{structure}", regex=False, na=False)
        level_2_3_structure_info.loc[link_mask, 'Fold AA seq'] = aa_seq
        level_2_3_structure_info.loc[link_mask, 'Avg pLDDT'] = avg_plddt
        level_2_3_structure_info.loc[link_mask, 'pLDDTs'] = all_plddts

        cur_df = level_2_3_structure_info.loc[link_mask].reset_index(drop=True)
        if saving:
            # Append after every structure so progress survives an interruption;
            # the header is written only when the file is created.
            write_header = not os.path.exists(save_path)
            cur_df.to_csv(save_path, mode='a', header=write_header, index=False)
        else:
            in_memory_rows.append(cur_df)

    if not saving:
        if in_memory_rows:
            return pd.concat(in_memory_rows, ignore_index=True)
        return level_2_3_structure_info.iloc[0:0].copy()

    if not os.path.exists(save_path):
        # Nothing was processed now or earlier
        return level_2_3_structure_info.iloc[0:0].copy()

    return pd.read_csv(save_path)
|
|
def process_fusionpdb_head_tail_files(ht, save_path='heads_and_tails_structures_processed.csv'):
    '''
    Download and parse the AlphaFold database structure of every head/tail protein.

    Args:
        ht: iterable of UniProt IDs
        save_path: CSV file to which one row per processed ID is appended (columns 'UniProtID',
            'Avg pLDDT', 'All pLDDTs', 'Seq'). If the file already exists, IDs listed in it are
            skipped, so an interrupted run can be resumed.

    Besides save_path, this function uses fixed paths relative to the working directory:
        - downloads go to raw_data/fusionpdb/head_tail_af2db_structures
        - reads processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv
        - writes processed_data/fusionpdb/intermediates/uniprotids_not_in_afdb.txt
        - reads raw_data/fusionpdb/not_in_afdb_idmap.txt

    An ID whose structure cannot be downloaded or parsed is recorded with empty values.

    Return: dataframe read back from save_path, with manually supplied values filled in for a
    few IDs that have no structure in the AlphaFold database. These manual values are not
    written back to save_path.
    '''
    log_update("\nProcessing head and tail structures...")

    secondary_structure_types = pull_secondary_structure_types()

    structure_dir = 'raw_data/fusionpdb/head_tail_af2db_structures'
    os.makedirs(structure_dir, exist_ok=True)

    output_columns = ['UniProtID', 'Avg pLDDT', 'All pLDDTs', 'Seq']

    # IDs that a previous run already wrote to save_path
    pre_loop_processed = []
    if os.path.exists(save_path):
        pre_loop_processed = pd.read_csv(save_path)['UniProtID'].tolist()
        log_update(f"Heads and tails already processed: {len(pre_loop_processed)}")
    already_processed = set(pre_loop_processed)

    for i, uniprotid in enumerate(ht):
        log_update(f'\tProcessing #{i+1}: {uniprotid}')
        aa_seq, avg_plddt, all_plddts = None, None, None

        if uniprotid in already_processed:
            log_update(f"\t\tAlready processed. Continuing")
            continue

        try:
            obj = AlphaFoldStructure(uniprot_to_download=uniprotid, secondary_structure_types=secondary_structure_types,
                                     uniprot_output_dir=structure_dir)
            aa_seq = obj.get_full_sequence()
            avg_plddt = obj.get_average_plddt()
            residues_df = obj.get_residues_df()
            all_plddts = ",".join(residues_df['pLDDT'].astype(str).tolist())

            log_update(f"\t\tAvg pLDDT: {round(avg_plddt,2)}\tFold AA seq: {aa_seq}\tFirst 5 pLDDTs: {','.join(all_plddts.split(',')[0:5])}")
        except Exception as err:
            # Best effort: a failed ID is still recorded (with whatever values were obtained)
            # so that it is reported as missing from the AlphaFold database further down.
            # KeyboardInterrupt and SystemExit are deliberately not caught.
            log_update(f"\t\tAvg pLDDT: {None}\tFold AA seq: {None}\tFirst 5 pLDDTs: {None}")
            log_update(f"\t\tCould not process {uniprotid}: {type(err).__name__}: {err}")

        # Append after every ID so progress survives an interruption; object dtype keeps the
        # values exactly as computed, and None is written as an empty field.
        cur_df = pd.DataFrame([[uniprotid, avg_plddt, all_plddts, aa_seq]], columns=output_columns, dtype=object)
        write_header = not os.path.exists(save_path)
        cur_df.to_csv(save_path, mode='a', header=write_header, index=False)

    if os.path.exists(save_path):
        ht_structures_df = pd.read_csv(save_path)
    else:
        # Nothing was processed now or earlier
        ht_structures_df = pd.DataFrame(columns=output_columns)

    # Compare what was processed against the full set of head/tail accessions on file
    level_2_3 = pd.read_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv')
    accessions = level_2_3['HGUniProtAcc'].tolist() + level_2_3['TGUniProtAcc'].tolist()
    # An entry may hold several comma-separated accessions; missing entries are not strings
    all_ht = set(','.join({x for x in accessions if isinstance(x, str)}).split(','))

    log_update(f"total heads and tails: {len(all_ht)}")
    log_update(f"total processed: {len(ht_structures_df)}\t{len(ht_structures_df['UniProtID'].unique())}")

    missing = all_ht - set(ht_structures_df['UniProtID'].unique())
    log_update(f"missing: {len(missing)}")
    log_update(missing)

    # IDs without an average pLDDT are the ones that could not be obtained from the database
    ht_structures_df = ht_structures_df.replace('', np.nan)
    need_to_fold = ht_structures_df[ht_structures_df['Avg pLDDT'].isna()]['UniProtID'].tolist()
    with open('processed_data/fusionpdb/intermediates/uniprotids_not_in_afdb.txt', 'w') as f:
        for uniprotid in need_to_fold:
            f.write(f'{uniprotid}\n')

    idmap = pd.read_csv('raw_data/fusionpdb/not_in_afdb_idmap.txt', sep='\t')
    idmap = idmap[idmap['Entry'].isin(need_to_fold)].reset_index(drop=True)
    idmap = idmap[['Entry', 'Sequence']].rename(columns={'Entry': 'ID'})
    idmap['Length'] = idmap['Sequence'].apply(len)

    log_update("Investigating heads and tails that were not in the AF2 database:")
    if len(idmap) > 0:
        # min()/max() raise on an empty column, so only report when there is something to report
        log_update(f"\tMin length: {min(idmap['Length'])}")
        log_update(f"\tMax length: {max(idmap['Length'])}")
    idmap = idmap.sort_values(by='Length', ascending=True).reset_index(drop=True)

    # Manually supplied average pLDDTs for IDs without a database structure.
    # NOTE(review): presumably taken from structures folded separately - confirm the source.
    manual_avg_plddts = {
        'Q9NNW7': 91.68,
        'Q16881': 89.55,
        'Q86V15': 48.14,
    }
    ids_not_in_afdb = set(idmap['ID'].tolist())
    for uniprot_acc, manual_avg in manual_avg_plddts.items():
        if uniprot_acc not in ids_not_in_afdb:
            continue
        row_mask = ht_structures_df['UniProtID'] == uniprot_acc
        ht_structures_df.loc[row_mask, 'Avg pLDDT'] = manual_avg
        # .item() requires exactly one matching row in the id map
        ht_structures_df.loc[row_mask, 'Seq'] = idmap.loc[idmap['ID'] == uniprot_acc, 'Sequence'].item()

    return ht_structures_df
|
|
def process_fusions_and_hts():
    '''
    Run the full FusionPDB structure pipeline and write the two cleaned output tables.

    Steps:
        1. Parse every fusion structure in raw_data/fusionpdb/structures.
        2. Download and parse the AlphaFold database structure of every head and tail protein.
        3. Keep only fusion structures whose sequence agrees with the raw FusionPDB download,
           and reduce them to one structure per (fusion, sequence).
        4. Attach head/tail structure information to the head/tail table.

    All paths are fixed and relative to the working directory. Outputs:
        - processed_data/fusionpdb/FusionPDB_level2-3_cleaned_structure_info.csv
        - processed_data/fusionpdb/FusionPDB_level2-3_cleaned_FusionGID_info.csv
    '''
    # ---- 1. Fusion structures ----
    level_2_3_structure_info_og = pd.read_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv')

    folder = 'raw_data/fusionpdb/structures'
    files = os.listdir(folder)
    log_update(f"total pdbs: {len(files)}")
    log_update(f"examples: {files[:5]}")

    os.makedirs('processed_data/fusionpdb', exist_ok=True)

    level_2_3_structure_info = process_fusionpdb_fusion_files(files, level_2_3_structure_info_og, folder, save_path='processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structures_processed.csv')

    # ---- 2. Head and tail structures ----
    level_2_3 = pd.read_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv')
    level_2_3['FusionGene'] = level_2_3['FusionGene'].str.replace('-', '::')

    heads = level_2_3['HGUniProtAcc'].tolist()
    tails = level_2_3['TGUniProtAcc'].tolist()
    # An entry may hold several comma-separated accessions; missing entries are not strings
    ht = {x for x in heads + tails if isinstance(x, str)}
    ht = set(','.join(ht).split(','))
    log_update(f"Unique heads/tails: {len(ht)}")

    # Sorted so that the processing (and output row) order is the same on every run
    heads_tails_analyzed = process_fusionpdb_head_tail_files(sorted(ht), save_path='processed_data/fusionpdb/heads_tails_structural_data.csv')

    # ---- FusionGene -> FusionGID lookup ----
    level_2 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level2_curated_09_05_2024.csv')
    level_3 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level3_curated_09_05_2024.csv')
    joined_23 = pd.concat([level_2, level_3]).reset_index(drop=True)
    joined_23['FusionGene'] = joined_23['FusionGene'].str.replace('-', '::')
    log_update(f"\nnumber of duplicated fusion gene rows: {len(joined_23[joined_23['FusionGene'].duplicated()])}")

    fo_gid_dict = dict(zip(joined_23['FusionGene'], joined_23['FusionGID']))
    log_update(len(fo_gid_dict))

    # ---- 3. Clean the fusion structure table ----
    # Drop rows for which no structure was processed
    level_2_3_structure_info_clean = level_2_3_structure_info.replace('', np.nan)
    level_2_3_structure_info_clean = level_2_3_structure_info_clean.dropna(subset=['Fold AA seq']).reset_index(drop=True)
    log_update(f"length of processed structure file: {len(level_2_3_structure_info_clean)}")
    level_2_3_structure_info_clean['pLDDT'] = level_2_3_structure_info_clean['Avg pLDDT'].round(2)
    level_2_3_structure_info_clean = level_2_3_structure_info_clean.drop(columns=['Avg pLDDT'])
    level_2_3_structure_info_clean['FusionGene'] = level_2_3_structure_info_clean['FusionGene'].str.replace('-', '::')
    # Raises KeyError if a fusion gene is absent from the curated level 2/3 tables
    level_2_3_structure_info_clean['FusionGID'] = level_2_3_structure_info_clean['FusionGene'].apply(lambda x: fo_gid_dict[x])

    log_update("Using FusionPDB as ground truth for sequences...")
    raw_download = pd.read_csv('../../data/raw_data/FusionPDB.txt', sep='\t', header=None)
    # The file has no header; columns 7 and 11 are joined into the fusion gene name and
    # column 18 is used as the amino acid sequence
    raw_download['FusionGene'] = raw_download[7] + '::' + raw_download[11]
    raw_download = raw_download.rename(columns={18: 'Raw Download AA Seq'})
    log_update(f"FusionPDB raw download size: {len(raw_download)}")

    level_2_3_structure_info_clean_ids = set(level_2_3_structure_info_clean['FusionGene'].tolist())
    level_2_3_structure_info_clean_seqs = set(level_2_3_structure_info_clean['Fold AA seq'].tolist())
    raw_download_ids = set(raw_download['FusionGene'].tolist())
    raw_download_seqs = set(raw_download['Raw Download AA Seq'].tolist())
    log_update(f"Number of overlapping gene IDs: {len(level_2_3_structure_info_clean_ids.intersection(raw_download_ids))}")
    log_update(f"Number of overlapping sequences: {len(level_2_3_structure_info_clean_seqs.intersection(raw_download_seqs))}")

    # Rows whose AlphaFold-derived sequence matches the raw download
    test_merge_1 = pd.merge(
        level_2_3_structure_info_clean.rename(columns={'Fold AA seq': 'Raw Download AA Seq'}),
        raw_download,
        on=['FusionGene', 'Raw Download AA Seq'],
        how='inner'
    )
    test_merge_1 = test_merge_1.drop(columns=['AA seq'])
    test_merge_1['Seq Source'] = ['AlphaFold,Raw Download'] * len(test_merge_1)
    log_update(f"Merge on AlphaFold AA Seq and raw Download AA Seq. len={len(test_merge_1)}")

    # Rows whose webpage sequence ('AA seq') matches the raw download
    test_merge_2 = pd.merge(
        level_2_3_structure_info_clean.rename(columns={'AA seq': 'Raw Download AA Seq'}),
        raw_download,
        on=['FusionGene', 'Raw Download AA Seq'],
        how='inner'
    )
    test_merge_2 = test_merge_2.drop(columns=['Fold AA seq'])
    test_merge_2['Seq Source'] = ['Webpage,Raw Download'] * len(test_merge_2)
    log_update(f"Merge on Webpage AA Seq and Raw Download AA Seq. len={len(test_merge_2)}")

    test_merge = pd.concat([test_merge_1, test_merge_2])
    test_merge['Len(AA seq)'] = test_merge['Raw Download AA Seq'].apply(len)
    test_merge = test_merge.drop_duplicates().reset_index(drop=True)

    # Group keys shared by the three aggregations below (order determines the output column order)
    fusion_keys = ['Hgene', 'Hchr', 'Hbp', 'Hstrand', 'Tgene', 'Tchr',
                   'Tbp', 'Tstrand', 'Len(AA seq)', 'FusionGene']

    # When the same record has several structure files, keep the one whose 'Structure Type'
    # sorts first
    log_update(f"len test_merge before keeping CIFs over identical PDBs: {len(test_merge)}")
    test_merge = test_merge.sort_values(by='Structure Type', ascending=True).reset_index(drop=True).groupby(
        fusion_keys + ['Level', 'Raw Download AA Seq', 'pLDDT', 'pLDDTs', 'FusionGID', 'Seq Source']
    ).agg(
        {
            'Structure Link': 'first',
            'Structure Type': 'first'
        }
    ).reset_index()
    log_update(f"len after: {len(test_merge)}")

    # Collapse rows that differ only in their sequence source into one row listing all sources
    log_update(f"len test_merge before combining seq sources: {len(test_merge)}")
    test_merge = test_merge.groupby(
        ['Structure Link'] + fusion_keys + ['Structure Type', 'Level', 'Raw Download AA Seq', 'pLDDT', 'pLDDTs', 'FusionGID']
    ).agg(
        {
            'Seq Source': lambda x: ','.join(x)
        }
    ).reset_index()
    # sorted() keeps the combined label identical from run to run; plain set iteration order
    # depends on string hashing
    test_merge['Seq Source'] = test_merge['Seq Source'].apply(lambda x: ','.join(sorted(set(x.split(',')))))
    log_update(f"len after: {len(test_merge)}")

    # One structure per (fusion, sequence): keep the first of each group
    log_update(f"len test_merge before randomly choosing first fold when one seq has multiple folds: {len(test_merge)}")
    test_merge = test_merge.groupby(
        fusion_keys + ['Level', 'Raw Download AA Seq', 'FusionGID']
    ).agg(
        {
            'Structure Link': 'first',
            'Structure Type': 'first',
            'Seq Source': 'first',
            'pLDDT': 'first',
            'pLDDTs': 'first'
        }
    ).reset_index()
    log_update(f"len after: {len(test_merge)}")

    # rename_axis + reset_index(name=...) yields the columns ['Seq Source', 'count'] regardless
    # of how the installed pandas version names the value_counts() result
    source_str = test_merge['Seq Source'].value_counts().rename_axis('Seq Source').reset_index(name='count').to_string(index=False)
    source_str = "\t\t" + source_str.replace("\n", "\n\t\t")
    log_update(f"Distribution of sequence sources:\n{source_str}")

    # Keep only rows whose AlphaFold sequence agreed with the raw download
    test_merge = test_merge.loc[test_merge['Seq Source'].str.contains('AlphaFold')].reset_index(drop=True)
    log_update(f"Dropped rows where AlphaFold sequence was incorrect. New DataFrame length: {len(test_merge)}")

    # Sanity check: at most one row per (FusionGID, sequence)
    assert len(test_merge[test_merge.duplicated(['FusionGID', 'Raw Download AA Seq'])]) == 0

    test_merge['pLDDT'] = test_merge['pLDDT'].round(2)

    test_merge_v2 = test_merge[
        ['FusionGID', 'FusionGene', 'Raw Download AA Seq', 'Len(AA seq)', 'Hgene', 'Hchr', 'Hbp', 'Hstrand', 'Tgene', 'Tchr', 'Tbp', 'Tstrand',
         'Level', 'Structure Link', 'Structure Type', 'pLDDT', 'pLDDTs', 'Seq Source']
    ].rename(
        columns={
            'Raw Download AA Seq': 'Fusion_Seq',
            'Seq Source': 'Fusion_Seq_Source',
            'Structure Link': 'Fusion_Structure_Link',
            'Structure Type': 'Fusion_Structure_Type',
            'pLDDT': 'Fusion_pLDDT',
            'pLDDTs': 'Fusion_AA_pLDDTs',
            'Len(AA seq)': 'Fusion_Length'
        }
    )
    log_update(f"Unique FusionGIDs: {len(test_merge_v2['FusionGID'].unique())}")
    log_update(f"Number of structures: {len(test_merge_v2)}")

    # Report sequences shared by more than one fusion
    log_update("\nChecking for duplicate sequences..")
    log_update(f"\tThe structure-based fusion database of length {len(test_merge_v2)} has {len(test_merge_v2['Fusion_Seq'].unique())} unique fusion sequences.")
    dup_seqs = test_merge_v2[test_merge_v2['Fusion_Seq'].duplicated()]['Fusion_Seq'].tolist()
    dup_seqs_df = test_merge_v2.loc[test_merge_v2['Fusion_Seq'].isin(dup_seqs)].reset_index(drop=True)
    dup_seqs_df['FusionGID'] = dup_seqs_df['FusionGID'].astype(str)
    dup_seqs_df = dup_seqs_df.groupby('Fusion_Seq').agg({
        'FusionGID': lambda x: ','.join(x),
        'FusionGene': lambda x: ','.join(x)
    })
    dup_seqs_df_str = dup_seqs_df.to_string(index=False)
    dup_seqs_df_str = "\t" + dup_seqs_df_str.replace("\n", "\n\t")
    log_update(f"\tShowing FUsionGIDs and FusionGenes for duplicated sequences below:\n{dup_seqs_df_str}")

    # ---- 4. Attach head and tail structure information ----
    heads_tails_analyzed['Avg pLDDT'] = heads_tails_analyzed['Avg pLDDT'].round(2)

    level_2_3_v2 = pd.merge(
        level_2_3,
        heads_tails_analyzed.rename(columns={'UniProtID': 'HGUniProtAcc', 'Avg pLDDT': 'HG_pLDDT', 'All pLDDTs': 'HG_AA_pLDDTs', 'Seq': 'HG_Seq'}),
        on='HGUniProtAcc',
        how='left'
    )

    level_2_3_v2 = pd.merge(
        level_2_3_v2,
        heads_tails_analyzed.rename(columns={'UniProtID': 'TGUniProtAcc', 'Avg pLDDT': 'TG_pLDDT', 'All pLDDTs': 'TG_AA_pLDDTs', 'Seq': 'TG_Seq'}),
        on='TGUniProtAcc',
        how='left'
    )

    # ---- Save ----
    test_merge_v2.to_csv('processed_data/fusionpdb/FusionPDB_level2-3_cleaned_structure_info.csv', index=False)
    log_update("Saved file with all fusion structure pLDDTs to: processed_data/fusionpdb/FusionPDB_level2-3_cleaned_structure_info.csv")

    level_2_3_v2.to_csv('processed_data/fusionpdb/FusionPDB_level2-3_cleaned_FusionGID_info.csv', index=False)
    log_update("Saved file with all fusion protein heads and tails, and their structure pLDDTs to: processed_data/fusionpdb/FusionPDB_level2-3_cleaned_FusionGID_info.csv")
| |
def main():
    '''
    Entry point: run the whole pipeline inside the open_logfile context for
    process_fusion_structures_log.txt.
    '''
    logfile_context = open_logfile("process_fusion_structures_log.txt")
    with logfile_context:
        process_fusions_and_hts()
| |
# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
| |
|
|