| |
| import requests |
| import pandas as pd |
| import numpy as np |
| from bs4 import BeautifulSoup |
| import glob |
| import ast |
| import os |
| from pandas.errors import EmptyDataError |
|
|
| from fuson_plm.utils.logging import open_logfile, log_update, get_local_date_yr |
|
|
def get_levels_dataframe(level, print_progress=False):
    """Scrape the FusionPDB gene list for `level` and return it as a DataFrame.

    Parameters
    ----------
    level : int
        FusionPDB curation level (2 or 3).
    print_progress : bool
        If True, log the table size and the first few rows.

    Returns
    -------
    pd.DataFrame
        One row per fusion, with the scraped columns plus a 'URL' column split
        out of the (gid, url) tuples returned by scrape_level.
    """
    data, headers = scrape_level(level)

    if print_progress:
        # Expected row counts come from the counts shown on the FusionPDB pages.
        expected = {2: 2212, 3: 266}
        if level in expected:
            log_update(f'\nTable size {len(data)}; expected {expected[level]}')
        log_update('Example rows 1-5:')
        # Fixed: the original printed 7 rows (broke only after i>5) despite the
        # "rows 1-5" label; slice to exactly five.
        for row in data[:5]:
            log_update(row)

    df = pd.DataFrame(data, columns=headers)
    # FusionGID cells were scraped as (gid, absolute_url) tuples; split them
    # into a plain ID column and a URL column.
    df['URL'] = df['FusionGID'].apply(lambda x: x[1])
    df['FusionGID'] = df['FusionGID'].apply(lambda x: x[0])
    return df
|
|
def scrape_level(level):
    """Scrape the FusionPDB search-result table listing all fusions at a level.

    Parameters
    ----------
    level : int or str
        FusionPDB curation level; only 2 and 3 are supported (the anchor <h1>
        text with the known result counts exists only for these levels).

    Returns
    -------
    tuple[list, list]
        (data, headers): `headers` is the first six column titles; `data` has
        one list per table row, where linked cells (the FusionGID column) are
        (text, absolute_url) tuples and plain cells are stripped strings.

    Raises
    ------
    ValueError
        If `level` is not 2 or 3 (the original code crashed later with an
        unbound `specific_h1` in that case).
    requests.HTTPError
        If the FusionPDB server returns an error status.
    """
    level = str(level)
    if level not in ('2', '3'):
        raise ValueError(f"Unsupported FusionPDB level: {level}")

    url = f"https://compbio.uth.edu/FusionPDB/gene_search_result_0.cgi?type=chooseLevel&chooseLevel=level{level}"

    response = requests.get(url)
    response.raise_for_status()  # fail loudly instead of parsing an error page

    soup = BeautifulSoup(response.content, 'html.parser')

    # The result count is embedded in the page's <h1>; use it to anchor the
    # gene-list table. (`string=` is the modern name for bs4's `text=` arg.)
    if level == '2':
        specific_h1 = soup.find('h1', string='2212 Fusion gene(s) for your query: level2')
    else:
        specific_h1 = soup.find('h1', string='266 Fusion gene(s) for your query: level3')

    table = specific_h1.find_next('table', class_='geneList')

    # The first six summary cells double as the column headers.
    headers = [header.get_text().strip() for header in table.find_all('td', class_='content_middle_gene_summary')][0:6]

    rows = table.find_all('tr')[1:]  # skip the header row

    data = []
    for row in rows:
        columns = row.find_all('td', class_='content_middle_gene_summary')
        if not columns:
            continue

        row_data = []
        for column in columns:
            link = column.find('a')
            if link:
                # FusionGID cells link to the per-fusion detail page; keep
                # both the displayed ID and the absolute URL.
                href = link['href']
                fusion_gid = link.get_text(strip=True)
                full_url = f"https://compbio.uth.edu/FusionPDB/{href}"
                row_data.append((fusion_gid, full_url))
            else:
                row_data.append(column.get_text(strip=True))
        data.append(row_data)

    return data, headers
|
|
def get_structure_link_dataframe(id, print_progress=False):
    """Fetch the structure-link table for one FusionPDB ID as a DataFrame.

    Parameters
    ----------
    id : str or int
        FusionPDB FusionGID to query.
    print_progress : bool
        If True, log the number of scraped rows and the first few rows.

    Returns
    -------
    pd.DataFrame
        One row per structure link with the fixed `expected_cols` layout, or
        an empty DataFrame when the fusion has no structure table.
    """
    rows = get_structure_links(id)

    if print_progress:
        log_update(f'\nTable size {len(rows)}')
        log_update('Example rows 1-5:')
        # Fixed: the original printed 7 rows despite the "rows 1-5" label.
        for row in rows[:5]:
            log_update(row)

    if not rows:
        return pd.DataFrame()

    df = pd.DataFrame(rows)
    df = df.rename(columns={
        'Fusion protein PDB link (fusion AA seq ID in FusionPDB)': 'Structure Link'
    })
    # Each row may carry several links; explode to one link per row.
    df = df.explode('Structure Link').reset_index(drop=True)

    # Links in the table are relative; prefix the FusionPDB host.
    df['Structure Link'] = df['Structure Link'].apply(lambda x: 'https://compbio.uth.edu/FusionPDB/' + str(x))
    # Classify the file format from the link path.
    df['Structure Type'] = df['Structure Link'].apply(lambda x: 'PDB' if 'pdb_files' in x else ('CIF' if 'cif_files' in x else 'Unknown'))
    # Build the head::tail fusion name directly (the original created an
    # 'FO_Name' column and immediately renamed it to 'FusionGene').
    df['FusionGene'] = df['Hgene'] + '::' + df['Tgene']
    df['ID'] = [id] * len(df)

    # Enforce a stable column layout so rows can be appended across IDs;
    # columns missing from this particular table are filled with ''.
    expected_cols = ['ID', 'Structure Link', 'Hgene', 'Hchr', 'Hbp', 'Hstrand', 'Tgene', 'Tchr', 'Tbp', 'Tstrand', 'Len(AA seq)', 'Structure Type', 'FusionGene', 'AA seq']
    for col in expected_cols:
        if col not in df.columns:
            df[col] = [''] * len(df)
    df = df[expected_cols]

    return df
|
|
def get_structure_links(id, print_progress=False):
    """Scrape the 'Fusion Protein Structures' table for one FusionPDB ID.

    Parameters
    ----------
    id : str or int
        FusionPDB FusionGID used in the quick-search query.
    print_progress : bool
        If True, log whether the structure table was located.

    Returns
    -------
    list[dict]
        One dict per table row mapping column header -> value, where a cell
        containing <a> links maps to a list of hrefs and a plain cell maps to
        its stripped text. Returns [] when the section or table is absent.
    """
    # Quick-search results page for this fusion ID.
    url = f"https://compbio.uth.edu/FusionPDB/gene_search_result.cgi?page=page&type=quick_search&quick_search={id}"

    response = requests.get(url)
    html_content = response.content

    soup = BeautifulSoup(html_content, 'html.parser')

    # The structures section is anchored by <a name="FusionSTR"> whose <h2>
    # holds the section title.
    table_title = soup.find('a', {'name': 'FusionSTR'})
    rows = []

    if table_title and table_title.find('h2').text.strip() == 'Fusion Protein Structures':
        # The first geneList table after the anchor is a wrapper; the data
        # lives in the next nested <table>.
        table = table_title.find_next('table', class_='geneList')
        table = table.find_next('table')

        if table:
            if print_progress: log_update('table found')
            # Column headers come from <strong> tags in the first row.
            header_row = table.find('tr')
            headers = [header.get_text(strip=True) for header in header_row.find_all('strong')]

            rows = []
            for row in table.find_all('tr')[1:]:
                cells = row.find_all('td')
                row_data = {}
                skip_next = False
                for i, cell in enumerate(cells):
                    # Skip the cell that immediately follows a mol* viewer cell.
                    if skip_next:
                        skip_next = False
                        continue

                    cell_text = cell.get_text(strip=True)
                    # The "3D view using mol*" cell and its companion are UI
                    # widgets, not data — drop both.
                    if "3D view using mol*" in cell_text:
                        skip_next = True
                        continue

                    # NOTE(review): headers[i] keeps the ORIGINAL cell index even
                    # after cells are skipped, so header/value alignment relies on
                    # the mol* cells appearing after all data columns — confirm
                    # against the live page layout.
                    links = cell.find_all('a')
                    if links:
                        row_data[headers[i]] = [link.get('href') for link in links]
                    else:
                        celltext = cell.get_text(strip=True)
                        if len(celltext)>0:
                            row_data[headers[i]] = celltext
                if len(row_data)>0: rows.append(row_data)
        else:
            log_update('table not found')

    return rows
|
|
def process_td_elements(soup_object, add_links=False):
    """Collect display text from every td.content_left_gene_summary cell.

    Parameters
    ----------
    soup_object : bs4 element
        Parsed HTML (or sub-tree) to search for summary cells.
    add_links : bool
        If True, append " (<href>)" to cells that contain a link.

    Returns
    -------
    list[str]
        One string per cell, in document order. Text inside a <strong> tag
        takes precedence over the cell's own text.
    """
    data = []
    for cell in soup_object.find_all('td', class_='content_left_gene_summary'):
        # Prefer the bolded label when present; fall back to the full cell text.
        strong_tag = cell.find('strong')
        text_content = (strong_tag or cell).get_text(strip=True)

        if add_links:
            anchor = cell.find('a')
            if anchor:
                text_content += f" ({anchor.get('href')})"

        data.append(text_content)

    return data
|
|
def get_hgene_tgene_info(id, print_progress=False):
    """Scrape the 'Fusion Protein Summary' block for one FusionPDB ID.

    Parameters
    ----------
    id : str or int
        FusionPDB FusionGID used in the quick-search query.
    print_progress : bool
        Unused; kept for interface compatibility with sibling scrapers.

    Returns
    -------
    dict or None
        Mapping of summary fields to values. Head/tail fields (e.g. 'Gene ID',
        'UniProtAcc') hold a two-element [head, tail] list; fusion-level fields
        hold a string. Returns None when the summary section is not found
        (the original fell off the end implicitly).
    """
    url = f"https://compbio.uth.edu/FusionPDB/gene_search_result.cgi?page=page&type=quick_search&quick_search={id}"

    response = requests.get(url)
    html_content = response.content

    soup = BeautifulSoup(html_content, 'html.parser')

    # The summary section is a table.title whose <h2> carries the section name.
    title_table = soup.find('table', class_='title')
    if not (title_table and title_table.find('h2') and title_table.find('h2').get_text(strip=True) == 'Fusion Protein Summary'):
        return None

    # The data lives two sibling geneList tables after the title.
    gene_list_table = title_table.find_next_sibling('table', class_='geneList').find_next_sibling('table', class_='geneList')

    data = {
        "Fusion gene name": [],
        "FusionPDB ID": [],
        "FusionGDB2.0 ID": [],
        "Gene symbol": [],
        "Gene ID": [],
        "Gene name": [],
        "Synonyms": [],
        "Cytomap": [],
        "Type of gene": [],
        "Description": [],
        "Modification date": [],
        "UniProtAcc": []
    }

    td_data = process_td_elements(gene_list_table)

    # Cells before the 'Gene symbol' label describe the fusion itself; cells
    # from there on alternate (label, head value, tail value).
    split_ind = td_data.index('Gene symbol')
    fusion_info, ht_info = td_data[0:split_ind], td_data[split_ind:]

    for info in fusion_info:
        if ':' in info:
            # Split only on the first ':' so values containing colons survive
            # (the original `split(':')[0:2]` truncated them).
            key, value = info.split(':', 1)
            key = key.strip()
            # Fixed: the original tested the unstripped key against `data` but
            # assigned to the stripped key; strip before the membership test.
            if key in data:
                data[key] = value.strip()

    # Head/tail fields come in (label, head, tail) triples.
    for i in range(0, len(ht_info), 3):
        key, value1, value2 = ht_info[i:i + 3]
        key = key.strip()
        if key in data:
            data[key] = [value1.strip(), value2.strip()]
        if key == 'UniProtAcc':
            # UniProtAcc is the last field we need; stop scanning.
            break

    return data
|
|
def process_ids(ids, outdir='', level=2):
    """Download the structure-link table for each FusionPDB ID in `ids`,
    appending results to a running per-level CSV.

    IDs already present in the CSV, or known to lack structures (per the
    structureless-ids file), are skipped so the run is resumable.
    """
    csv_filename = f'{outdir}/FusionPDB_level{level}_fusion_structure_links.csv'

    # Resume support: collect IDs already written to the output CSV.
    already_processed_ids = []
    if os.path.isfile(csv_filename):
        already_processed_ids = pd.read_csv(csv_filename)['ID'].tolist()

    # IDs known (from a prior audit) to have no structures on FusionPDB.
    structureless_ids = pd.read_csv("raw_data/fusionpdb/fusionpdb_structureless_ids.txt",sep="\t",header=None)[0].tolist()

    log_update(f'\nLevel {level}:\n\tDownloading structure links for FusionPDB IDs:')
    for i, id in enumerate(ids):
        if id in already_processed_ids or id in structureless_ids:
            continue

        df = get_structure_link_dataframe(id)
        # First write creates the file with a header; later writes append rows.
        file_exists = os.path.isfile(csv_filename)
        df.to_csv(
            csv_filename,
            mode='a' if file_exists else 'w',
            index=False,
            header=not file_exists,
        )

        log_update(f'\t\t{i+1}. {id}')
| |
def process_ids_ht(ids, outdir='', level=2):
    """Fetch head/tail gene info (incl. UniProt accessions) for every ID and
    write one dict-repr per line to a per-level text file.

    If the output file already exists the whole level is assumed done and
    nothing is fetched.
    """
    outfile = f'{outdir}/level{level}_head_tail_info.txt'

    if os.path.isfile(outfile):
        log_update(f"\nAlready acquired UniProt accessions of head and tail genes for Level {level} at: {outfile}")
        return

    log_update(f"\n\tAcquiring UniProt accessions of head and tail genes for Level {level}")
    with open(outfile, 'a+') as handle:
        for id in ids:
            info = get_hgene_tgene_info(id)
            # Keep only the identifiers we need downstream; head/tail fields
            # arrive as [head, tail] pairs.
            record = {
                'FusionGID': info['FusionPDB ID'],
                'HGID': info['Gene ID'][0],
                'TGID': info['Gene ID'][1],
                'HGUniProtAcc': info['UniProtAcc'][0],
                'TGUniProtAcc': info['UniProtAcc'][1]
            }
            handle.write(str(record))
            handle.write('\n')
            # Flush so progress survives an interrupted run.
            handle.flush()
| |
def download_file(url, directory):
    """Download `url` into `directory`, returning the local path.

    Skips the download if the file already exists. The response is streamed to
    a temporary '.part' file and atomically renamed on success, so an
    interrupted download can never leave a truncated file that later runs
    would mistake for a complete one (the original buffered the whole body in
    memory, which is wasteful for large structure files).

    Raises
    ------
    requests.HTTPError
        If the server returns an error status.
    """
    local_filename = os.path.join(directory, url.split('/')[-1])
    if os.path.exists(local_filename):
        return local_filename

    response = requests.get(url, stream=True)
    response.raise_for_status()

    tmp_filename = local_filename + '.part'
    with open(tmp_filename, 'wb') as file:
        for chunk in response.iter_content(chunk_size=65536):
            file.write(chunk)
    # Atomic rename: the final path appears only once fully written.
    os.replace(tmp_filename, local_filename)
    return local_filename
| |
def download_structures(download_links):
    """Download every structure file in `download_links` into the raw-data
    structures folder, logging each success or failure.

    Failures are logged and skipped so one bad link does not abort the batch.
    """
    download_directory = "raw_data/fusionpdb/structures"
    os.makedirs(download_directory, exist_ok=True)

    for url in download_links:
        try:
            log_update(f"Downloading {url}...")
            download_file(url, download_directory)
            log_update(f"\tDownloaded {url} to {download_directory}")
        except Exception as err:
            # Best-effort batch: record the failure and move on.
            log_update(f"\tFailed to download {url}. Reason: {err}")

    log_update("All downloads completed.")
| |
def combine_ht_info():
    """Combine the level-2 and level-3 head/tail info files into one DataFrame.

    Each file holds one Python-dict literal per line (written by
    process_ids_ht); the records are parsed with ast.literal_eval and stacked.

    Returns
    -------
    pd.DataFrame
        All head/tail records with FusionGID coerced to str for merging.
    """
    outdir = 'raw_data/fusionpdb'
    head_tail_data = []

    # Both level files share the same format; read them in one loop.
    for level in (2, 3):
        with open(f'{outdir}/level{level}_head_tail_info.txt', 'r') as f:
            for line in f:
                # Each line is the repr of a dict; literal_eval is safe for this.
                head_tail_data.append(ast.literal_eval(line.strip()))

    ht_df = pd.DataFrame(head_tail_data)
    ht_df['FusionGID'] = ht_df['FusionGID'].astype(str)
    return ht_df
|
|
| |
def find_h_source(row):
    """Return where the head gene's UniProt accession came from:
    'FusionPDB' if scraped directly, 'UniProt ID Map' if remapped,
    or None when neither source produced one."""
    if row['HGUniProtAcc'] is not None:
        return 'FusionPDB'
    if row['Entry_Hgene'] is not None:
        return 'UniProt ID Map'
    return None
|
|
def find_t_source(row):
    """Return where the tail gene's UniProt accession came from:
    'FusionPDB' if scraped directly, 'UniProt ID Map' if remapped,
    or None when neither source produced one."""
    if row['TGUniProtAcc'] is not None:
        return 'FusionPDB'
    if row['Entry_Tgene'] is not None:
        return 'UniProt ID Map'
    return None
|
|
def correct_huniprot(row):
    """Pick the head gene's final UniProt accession: the FusionPDB-scraped
    value when present, otherwise the UniProt ID-map entry (None if neither)."""
    fusionpdb_acc = row['HGUniProtAcc']
    if fusionpdb_acc is not None:
        return fusionpdb_acc
    # Entry_Hgene is either the remapped accession or None — both are the
    # correct fallback value.
    return row['Entry_Hgene']
|
|
def correct_tuniprot(row):
    """Pick the tail gene's final UniProt accession: the FusionPDB-scraped
    value when present, otherwise the UniProt ID-map entry (None if neither)."""
    fusionpdb_acc = row['TGUniProtAcc']
    if fusionpdb_acc is not None:
        return fusionpdb_acc
    # Entry_Tgene is either the remapped accession or None — both are the
    # correct fallback value.
    return row['Entry_Tgene']
| |
def combine_ht_info_with_structure_links(giant, ht_df):
    """Merge head/tail gene info (`ht_df`) into the combined level-2/3 fusion
    table (`giant`) and resolve one UniProt accession per head and tail gene.

    Pipeline:
      1. Left-join ht_df onto giant by FusionGID and sanity-check that the
         HGID/TGID columns agree between the two sources.
      2. Blank out accessions known to be wrong (hard-coded list plus any
         comma-separated multi-accession values).
      3. Merge in a UniProt ID-map file keyed on gene symbol and flag rows
         whose gene IDs were confirmed by the map.
      4. Partition rows into three tiers (both accs from FusionPDB / both
         mapped / only one mapped), dedupe tier 3, recombine, and pick the
         final accession + provenance columns per row.

    Parameters
    ----------
    giant : pd.DataFrame
        Combined level-2/3 table with FusionGID, FusionGene, Hgene, Tgene,
        URL, HGID, TGID columns.
    ht_df : pd.DataFrame
        Output of combine_ht_info(): FusionGID, HGID, TGID, HGUniProtAcc,
        TGUniProtAcc.

    Returns
    -------
    pd.DataFrame
        One row per fusion with resolved HG/TG UniProt accessions and their
        sources ('FusionPDB' or 'UniProt ID Map').
    """
    # Attach head/tail info to every fusion row. Both frames carry HGID/TGID,
    # so pandas suffixes them _x (from giant) / _y (from ht_df).
    giant_with_hts = pd.merge(giant, ht_df, on='FusionGID', how='left')

    # Normalize to str so the equality check below compares like with like.
    giant_with_hts['HGID_x'] = giant_with_hts['HGID_x'].astype(str)
    giant_with_hts['HGID_y'] = giant_with_hts['HGID_y'].astype(str)
    giant_with_hts['TGID_x'] = giant_with_hts['TGID_x'].astype(str)
    giant_with_hts['TGID_y'] = giant_with_hts['TGID_y'].astype(str)

    giant_with_hts['HGID_match'] = giant_with_hts['HGID_x'] == giant_with_hts['HGID_y']
    giant_with_hts['TGID_match'] = giant_with_hts['TGID_x'] == giant_with_hts['TGID_y']

    # Both scrapes must agree on the head/tail gene IDs; abort otherwise.
    assert giant_with_hts['HGID_match'].all() and giant_with_hts['TGID_match'].all()

    giant_with_hts = giant_with_hts.drop(['HGID_x','TGID_x','HGID_match','TGID_match'],axis=1).rename(columns={'HGID_y':'HGID','TGID_y':'TGID'})
    # FusionPDB uses '.' as its missing-value marker.
    giant_with_hts = giant_with_hts.replace('.',np.nan)

    # Coverage accounting before any corrections.
    hgid_only = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].notna() & giant_with_hts['TGUniProtAcc'].isna()])
    tgid_only = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna() & giant_with_hts['TGUniProtAcc'].notna()])
    hgid_and_tgid = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].notna() & giant_with_hts['TGUniProtAcc'].notna()])
    neither = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna() & giant_with_hts['TGUniProtAcc'].isna()])

    log_update(f"\nFusions with HGID only: {hgid_only}")
    log_update(f"Fusions with TGID only: {tgid_only}")
    log_update(f"Fusions with HGID and TGID: {hgid_and_tgid}")
    log_update(f"Fusions with neither: {neither}")
    log_update(f"Sum = {hgid_only+tgid_only+hgid_and_tgid+neither} = {len(giant_with_hts)}")

    # Gene symbols whose accession FusionPDB failed to provide.
    unmapped_h = set(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna()]['Hgene'].tolist())
    unmapped_t = set(giant_with_hts[giant_with_hts['TGUniProtAcc'].isna()]['Tgene'].tolist())

    unmapped_parts = unmapped_h.union(unmapped_t)
    log_update(f"unmapped hgenes: {len(unmapped_h)}")
    log_update(f"unmapped tgenes: {len(unmapped_t)}")
    log_update(f"unmapped parts (hgids or tgids): {len(unmapped_parts)}")

    # Accessions scraped from FusionPDB that a manual audit found to be wrong.
    # NOTE(review): these look like gene SYMBOLS, yet below they are matched
    # against the HG/TGUniProtAcc columns via isin() — confirm that FusionPDB
    # sometimes puts the symbol in the accession field, which is presumably
    # the failure mode being corrected here.
    wrong_uniprot_ids =[
        'PRY',
        'TIAF1',
        'DCAF8L2',
        'UMAD1',
        'TIPIN',
        'GAB3',
        'OTOA',
        'PAGR1',
        'PRY2',
        'FAM178A',
        'SPATS2L',
        'VMAC',
        'ZNFX1',
        'TFPT',
        'TRANK1',
        'RRP15',
        'PAXBP1',
        'RB1CC1',
        'PACRGL',
        'TRMT1L',
        'PPPDE2',
        'YY1AP1',
        'RGP1',
        'SHKBP1',
        'RINT1',
        'PRAM1',
        'PIR',
        'TMBIM6',
        'PICK1',
        'PLEC',
        'NUDCD3',
        'CCBL1',
        'S100PBP',
        'RTL1',
        'C10orf140',
        'CD177',
        'SLF2',
        'STARD3NL',
        'RELL2',
        'AMIGO1',
        'TRAF3IP1',
        'PNOC',
        'PERM1',
        'UBE2F',
        'TBKBP1',
        'PAN3',
        'NSFL1C',
        'SPAST',
        'TOX4',
        'RGPD8',
        'ZDHHC9',
        'SLAMF9',
        'TNNT1',
        'TEKT5',
        'TPI1',
        'TAAR6',
        'SKIDA1',
        'PMS1'
    ]

    # Also treat comma-separated multi-accession values as wrong (ambiguous).
    wrong_uniprot_ids += giant_with_hts[
        ~(giant_with_hts['HGUniProtAcc'].isna()) &
        (giant_with_hts['HGUniProtAcc'].str.contains(","))
    ]['HGUniProtAcc'].tolist()

    wrong_uniprot_ids += giant_with_hts[
        ~(giant_with_hts['TGUniProtAcc'].isna()) &
        (giant_with_hts['TGUniProtAcc'].str.contains(","))
    ]['TGUniProtAcc'].tolist()

    # Gene symbols whose accessions must be re-mapped via UniProt.
    hts_tomap_part2 = giant_with_hts[giant_with_hts['HGUniProtAcc'].isin(wrong_uniprot_ids)]['Hgene'].tolist()
    hts_tomap_part2 += giant_with_hts[giant_with_hts['TGUniProtAcc'].isin(wrong_uniprot_ids)]['Tgene'].tolist()
    hts_tomap_part2 = set(hts_tomap_part2)
    log_update(f"Total head and tail genes that need to be mapped again: {len(hts_tomap_part2)}")

    # Persist the full to-remap list (never-mapped + wrongly-mapped symbols)
    # for the external UniProt ID-mapping step.
    with open('processed_data/fusionpdb/intermediates/unmapped_parts.txt','w') as f:
        for part in unmapped_parts:
            f.write(f'{part}\n')
        for part in hts_tomap_part2:
            f.write(f'{part}\n')

    # Blank the wrong accessions so they fall through to the ID-map values.
    giant_with_hts.loc[
        giant_with_hts['HGUniProtAcc'].isin(wrong_uniprot_ids),
        'HGUniProtAcc'
    ] = np.nan
    giant_with_hts.loc[
        giant_with_hts['TGUniProtAcc'].isin(wrong_uniprot_ids),
        'TGUniProtAcc'
    ] = np.nan

    # Pre-computed UniProt ID-map export (gene symbol -> Entry/GeneID).
    idmap = pd.read_csv(f'raw_data/fusionpdb/hgene_tgene_uniprot_idmap_07_10_2024.txt',sep='\t')

    # A mapped symbol may list several ';'-separated NCBI gene IDs.
    idmap['n_GeneID'] = idmap['GeneID'].apply(lambda x: [y for y in str(x).strip().split(';') if len(y)>0])
    idmap['n_GeneID'] = idmap['n_GeneID'].apply(lambda x: len(x))

    log_update(f"Genes may have the following total #s of gene IDs: {idmap['n_GeneID'].unique()}")

    # Sanity check: every GeneID string is ';'-terminated, which makes the
    # 'HGID;' substring test below unambiguous (e.g. '123;' can't match '1234;').
    log_update(f"All GeneIDs end in ; {idmap['GeneID'].apply(lambda x: x[-1] == ';' if type(x)==str else True).all()}")

    # Join the ID map twice: once keyed on the head symbol, once on the tail.
    idmap_merge = pd.merge(giant_with_hts, idmap[['From','Entry','GeneID']].rename(columns={'From':'Hgene', 'Entry': 'Entry_Hgene', 'GeneID': 'GeneID_Hgene'}), on='Hgene',how='left')
    idmap_merge = pd.merge(idmap_merge, idmap[['From','Entry','GeneID']].rename(columns={'From':'Tgene', 'Entry': 'Entry_Tgene', 'GeneID': 'GeneID_Tgene'}), on='Tgene',how='left')

    # ';'-suffixed IDs for the substring membership test against GeneID lists.
    idmap_merge['HGID;'] = idmap_merge['HGID'].astype(str) + ';'
    idmap_merge['TGID;'] = idmap_merge['TGID'].astype(str) + ';'

    # True when the FusionPDB gene ID appears in the ID map's GeneID list,
    # i.e. the symbol-based remap resolved to the same gene.
    idmap_merge['HGID_Found'] = idmap_merge.apply(lambda row: row['HGID;'] in str(row['GeneID_Hgene']), axis=1)
    idmap_merge['TGID_Found'] = idmap_merge.apply(lambda row: row['TGID;'] in str(row['GeneID_Tgene']), axis=1)

    # Keep rows where mapping succeeded by at least one route.
    idmap_merge_success = idmap_merge.loc[
        # both accessions present straight from FusionPDB, or ...
        ((idmap_merge['HGUniProtAcc'].notna()) & (idmap_merge['TGUniProtAcc'].notna())) |
        # ... the missing head accession was recovered via the ID map, or ...
        ((idmap_merge['HGUniProtAcc'].isna()) & (idmap_merge['HGID_Found']==True)) |
        # ... the missing tail accession was recovered via the ID map.
        ((idmap_merge['TGUniProtAcc'].isna()) & (idmap_merge['TGID_Found']==True))
    ].reset_index(drop=True)
    idmap_merge_success['FusionGID'] = idmap_merge_success['FusionGID'].astype(str)
    log_update(f"rows: {len(idmap_merge_success)}")
    log_update(f"unique successful fusion GIDs: {len(idmap_merge_success['FusionGID'].unique())}")

    # --- Partition the successful rows into three mutually exclusive tiers ---
    # Tier 1: both accessions came directly from FusionPDB.
    partition1 = idmap_merge_success.loc[
        ((idmap_merge_success['HGUniProtAcc'].notna()) & (idmap_merge_success['TGUniProtAcc'].notna()))
    ].reset_index(drop=True)
    partition1_gids = set(partition1['FusionGID'].tolist())
    log_update("Partition 1: HGUniProtAcc.notna() and TGUniProtAcc.notna() --> both UniProt accessions were found on FusionPDB")
    log_update(f"\t# GIDs: {len(partition1_gids)}")

    # Tier 2: both gene IDs confirmed via the ID map (and not already tier 1).
    partition2 = idmap_merge_success.loc[
        (idmap_merge_success['HGID_Found']==True) & (idmap_merge_success['TGID_Found']==True) &
        ~(idmap_merge_success['FusionGID'].isin(partition1_gids))
    ].reset_index(drop=True)
    partition2_gids = set(partition2['FusionGID'].tolist())
    log_update("Partition 2: HGID_Found & TGID_Found --> both UniProt accessions were mapped successfully; one or both was found by remapping on UniProt")
    log_update(f"\t# GIDs: {len(partition2_gids)}")

    # Tier 3: exactly one side resolved; best effort.
    partition3 = idmap_merge_success.loc[
        # not tier-1 and not tier-2 shaped ...
        ~(
            ((idmap_merge_success['HGUniProtAcc'].notna()) & (idmap_merge_success['TGUniProtAcc'].notna())) |
            ((idmap_merge_success['HGID_Found']==True) & (idmap_merge_success['TGID_Found']==True))
        ) &
        # ... but at least one side confirmed by the ID map ...
        ((idmap_merge_success['HGID_Found']==True) | (idmap_merge_success['TGID_Found']==True)) &
        # ... and the GID not already claimed by an earlier tier.
        ~(idmap_merge_success['FusionGID'].isin(partition1_gids)) &
        ~(idmap_merge_success['FusionGID'].isin(partition2_gids))
    ].reset_index(drop=True)
    partition3_gids = set(partition3['FusionGID'].tolist())
    log_update("Partition 3: HGID_Found or TGID_Found --> only one was successful, this was the best we can do")
    log_update(f"\t# GIDs: {len(partition3_gids)}")

    # Duplicate GIDs arise when the ID-map merge fanned one fusion into
    # several rows (multiple Entry matches per symbol).
    partition1_dups = partition1[partition1.duplicated('FusionGID')]['FusionGID'].unique().tolist()
    partition2_dups = partition2[partition2.duplicated('FusionGID')]['FusionGID'].unique().tolist()
    partition3_dups = partition3[partition3.duplicated('FusionGID')]['FusionGID'].unique().tolist()

    log_update(f"\nDuplicate IDs in partition 1: {len(partition1_dups)}")
    log_update(f"Duplicate IDs in partition 2: {len(partition2_dups)}")
    log_update(f"Duplicate IDs in partition 3: {len(partition3_dups)} \tDuplicate rows: {len(partition3[partition3['FusionGID'].isin(partition3_dups)])}")

    log_update(f"\nRows in original dataset: {len(idmap_merge_success)}")
    log_update(f"Rows in partitions: {len(partition1)+len(partition2)+len(partition3)}")

    # The three tiers must exactly cover the successful GID set.
    all_starting_fusiongids = set(idmap_merge_success['FusionGID'].tolist())
    all_fusiongids = set(partition1['FusionGID'].tolist()) | set(partition2['FusionGID'].tolist()) | set(partition3['FusionGID'].tolist())
    log_update(f"\nFusion GIDs captured in original dataset: {len(all_fusiongids)} {len(partition1_gids)+len(partition2_gids)+len(partition3_gids)}")
    log_update(f"Fusion GIDs captured in the 3 partitions: {len(all_starting_fusiongids)}")
    log_update(f"Same set? {all_starting_fusiongids == all_fusiongids}")

    # Collapse tier-3 duplicates: stringify Entry columns (NaN -> 'nan') so
    # they can be joined, ...
    partition3['Entry_Hgene'] = partition3['Entry_Hgene'].astype(str)
    partition3['Entry_Tgene'] = partition3['Entry_Tgene'].astype(str)

    # ... then keep one row per FusionGID, comma-joining the distinct Entry
    # values and taking the first value of every other column.
    partition3 = partition3.groupby('FusionGID').agg({
        'FusionGID': 'first',
        'FusionGene': 'first',
        'Hgene': 'first',
        'Tgene': 'first',
        'URL': 'first',
        'HGID': 'first',
        'TGID': 'first',
        'HGUniProtAcc': 'first',
        'TGUniProtAcc': 'first',
        'Entry_Hgene': lambda x: ','.join(set([y for y in x])),
        'GeneID_Hgene': 'first',
        'Entry_Tgene': lambda x: ','.join(set([y for y in x])),
        'GeneID_Tgene': 'first',
        'HGID;': 'first',
        'TGID;': 'first',
        'HGID_Found': 'first',
        'TGID_Found': 'first'
    }
    ).reset_index(drop=True)

    # Stack the three tiers back into one table.
    recombined = pd.concat(
        [
            partition1,
            partition2,
            partition3
        ]
    ).reset_index(drop=True)

    log_update(f"Duplicate GID rows: {len(recombined[recombined.duplicated('FusionGID')])}")
    # Normalize missing values to None ('nan' strings came from astype(str)
    # above) so the row-wise helpers can test `is not None`.
    recombined = recombined.replace({np.nan: None, 'nan': None})

    # Resolve the final accession and record its provenance per row.
    recombined['HGUniProtAcc_Source'] = recombined.apply(lambda row: find_h_source(row), axis=1)
    recombined['TGUniProtAcc_Source'] = recombined.apply(lambda row: find_t_source(row), axis=1)
    recombined['HGUniProtAcc'] = recombined.apply(lambda row: correct_huniprot(row), axis=1)
    recombined['TGUniProtAcc'] = recombined.apply(lambda row: correct_tuniprot(row), axis=1)

    # Consistency logging between source labels and Entry columns.
    # NOTE(review): the `== None` comparisons below are elementwise Series
    # comparisons, which are False for every element in pandas — so the last
    # two checks select empty frames and are vacuously True; confirm intent.
    log_update(f"Every row with UniProt ID Map as HGUniProtAcc_Source has an Entry_Hgene: {recombined.loc[recombined['HGUniProtAcc_Source']=='UniProt ID Map']['Entry_Hgene'].apply(lambda x: x is not None).all()}")
    log_update(f"Every row with UniProt ID Map as TGUniProtAcc_Source has an Entry_Tgene: {recombined.loc[recombined['TGUniProtAcc_Source']=='UniProt ID Map']['Entry_Tgene'].apply(lambda x: x is not None).all()}")
    log_update(f"Every row with no HGUniProtAcc_Source has no Entry_Hgene: {recombined.loc[recombined['HGUniProtAcc_Source']==None]['Entry_Hgene'].apply(lambda x: x is None).all()}")
    log_update(f"Every row with no TGUniProtAcc_Source has no Entry_Tgene: {recombined.loc[recombined['TGUniProtAcc_Source']==None]['Entry_Tgene'].apply(lambda x: x is None).all()}")

    # Final column selection; restore NaN as the missing-value marker.
    recombined = recombined[['FusionGID', 'FusionGene', 'Hgene', 'Tgene', 'URL', 'HGID', 'TGID', 'HGUniProtAcc', 'TGUniProtAcc', 'HGUniProtAcc_Source', 'TGUniProtAcc_Source']]
    recombined = recombined.replace({None: np.nan})

    # Post-correction coverage accounting (mirrors the block at the top).
    hgid_only = len(recombined[recombined['HGUniProtAcc'].notna() & recombined['TGUniProtAcc'].isna()])
    tgid_only = len(recombined[recombined['HGUniProtAcc'].isna() & recombined['TGUniProtAcc'].notna()])
    hgid_and_tgid = len(recombined[recombined['HGUniProtAcc'].notna() & recombined['TGUniProtAcc'].notna()])
    neither = len(recombined[recombined['HGUniProtAcc'].isna() & recombined['TGUniProtAcc'].isna()])

    log_update(f"Fusions with HGID only: {hgid_only}")
    log_update(f"Fusions with TGID only: {tgid_only}")
    log_update(f"Fusions with HGID and TGID: {hgid_and_tgid}")
    log_update(f"Fusions with neither: {neither}")
    log_update(f"Sum = {hgid_only+tgid_only+hgid_and_tgid+neither} = {len(recombined)}")

    return recombined
|
|
def scrape_fusionpdb_level_2_3():
    """Run the full FusionPDB level-2/3 scraping pipeline.

    Steps:
      1. Scrape (or reload cached) level-2 and level-3 fusion ID tables.
      2. Download per-fusion structure links and head/tail gene info.
      3. Combine everything and resolve UniProt accessions.
      4. Download every linked structure file.

    All outputs land under raw_data/fusionpdb and processed_data/fusionpdb.
    """
    os.makedirs("raw_data/fusionpdb", exist_ok=True)
    os.makedirs("processed_data/fusionpdb", exist_ok=True)
    os.makedirs("processed_data/fusionpdb/intermediates", exist_ok=True)

    # Level 2 IDs: reuse a cached curated CSV when present, otherwise scrape.
    matching_file = glob.glob('raw_data/fusionpdb/FusionPDB_level2_curated_*.csv')
    if len(matching_file) > 0:
        log_update(f"\nLevel 2 was already scraped for IDs. Loading data from: {matching_file[0]}")
        level2_df = pd.read_csv(matching_file[0])
    else:
        log_update(f"\nScraping Level 2 IDs from FusionPDB")
        dt_tag = get_local_date_yr()
        level2_df = get_levels_dataframe(2, print_progress=True)
        level2_df['FusionGID'] = level2_df['FusionGID'].astype(str)
        level2_df.to_csv(f'raw_data/fusionpdb/FusionPDB_level2_curated_{dt_tag}.csv', index=False)

    # Level 3 IDs. BUG FIX: the original condition was `len(matching_file)>>0`
    # (a right bit-shift, not a comparison) which only behaved correctly by
    # coincidence because `n >> 0 == n` is truthy iff n > 0.
    matching_file = glob.glob('raw_data/fusionpdb/FusionPDB_level3_curated_*.csv')
    if len(matching_file) > 0:
        log_update(f"\nLevel 3 was already scraped for IDs. Loading data from: {matching_file[0]}")
        level3_df = pd.read_csv(matching_file[0])
    else:
        log_update(f"\nScraping Level 3 IDs from FusionPDB")
        dt_tag = get_local_date_yr()
        level3_df = get_levels_dataframe(3, print_progress=True)
        level3_df['FusionGID'] = level3_df['FusionGID'].astype(str)
        level3_df.to_csv(f'raw_data/fusionpdb/FusionPDB_level3_curated_{dt_tag}.csv', index=False)

    level2_ids = set(level2_df['FusionGID'].tolist())
    level3_ids = set(level3_df['FusionGID'].tolist())
    log_update(f"Total overlapping fusionGIDs between levels 2 and 3: {len(level2_ids.intersection(level3_ids))}")

    # Per-ID structure links and head/tail gene info for both levels
    # (these helpers are resumable, so reruns skip completed work).
    links_save_dir = 'raw_data/fusionpdb'
    os.makedirs(links_save_dir, exist_ok=True)
    process_ids(level2_ids, outdir=links_save_dir, level=2)
    process_ids_ht(level2_ids, outdir=links_save_dir, level=2)
    process_ids(level3_ids, outdir=links_save_dir, level=3)
    process_ids_ht(level3_ids, outdir=links_save_dir, level=3)

    # Combined head/tail info across both levels.
    ht_df = combine_ht_info()
    ht_df.to_csv("processed_data/fusionpdb/fusion_heads_and_tails.csv", index=False)

    log_update("\nCombining level 2 and 3 data")
    giant_level2 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level2_fusion_structure_links.csv').rename(columns={'ID':'FusionGID'})
    giant_level2 = pd.merge(giant_level2, level2_df[['FusionGID','FusionGene','URL','HGID','TGID']], on=['FusionGID','FusionGene'], how='left')
    log_update(f"\tSize of raw_data/fusionpdb/FusionPDB_level2_fusion_structure_links.csv: {len(giant_level2)}")

    giant_level3 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level3_fusion_structure_links.csv').rename(columns={'ID':'FusionGID'})
    giant_level3 = pd.merge(giant_level3, level3_df[['FusionGID','FusionGene','URL','HGID','TGID']], on=['FusionGID','FusionGene'], how='left')
    log_update(f"\tSize of raw_data/fusionpdb/FusionPDB_level3_fusion_structure_links.csv: {len(giant_level3)}")

    giant_level2['Level'] = [2] * len(giant_level2)
    giant_level3['Level'] = [3] * len(giant_level3)

    # All structure links across both levels, deduplicated and sorted.
    giant_sl = pd.concat([giant_level2, giant_level3]).drop_duplicates().reset_index(drop=True)
    giant_sl.sort_values(by='FusionGID', ascending=True).reset_index(drop=True).to_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv', index=False)
    log_update(f"\nSaving file with all Level 2 and 3 Structure links (size: {len(giant_sl)}) to: processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv")

    # Resolve UniProt accessions for every head/tail gene and save.
    giant_ht = pd.concat([level2_df, level3_df]).reset_index(drop=True)
    giant_ht['FusionGID'] = giant_ht['FusionGID'].astype(str)
    giant_with_ht = combine_ht_info_with_structure_links(giant_ht, ht_df)
    giant_with_ht.sort_values(by='FusionGID', ascending=True).reset_index(drop=True).to_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv', index=False)

    # Finally, pull down every structure file referenced by the combined table.
    download_structures(giant_sl['Structure Link'].tolist())
| |
def main():
    """Script entry point: run the FusionPDB scrape inside a logfile context."""
    log_name = "fetch_fusionpdb_data_log.txt"
    with open_logfile(log_name):
        scrape_fusionpdb_level_2_3()
|
|
# Run the full scraping pipeline only when executed directly as a script.
if __name__ == "__main__":
    main()
| |