Spaces:
Sleeping
Sleeping
Commit
·
771222f
1
Parent(s):
f9b9a8c
Update code/modbaseModelAdd.py
Browse files- code/modbaseModelAdd.py +0 -149
code/modbaseModelAdd.py
CHANGED
|
@@ -1,152 +1,3 @@
|
|
| 1 |
-
# import requests
|
| 2 |
-
# import numpy as np
|
| 3 |
-
# import pandas as pd
|
| 4 |
-
# from utils import *
|
| 5 |
-
# from pathlib import Path
|
| 6 |
-
# from bs4 import BeautifulSoup
|
| 7 |
-
# from add_sasa import *
|
| 8 |
-
# def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
|
| 9 |
-
# if len(dataframe) != 0:
|
| 10 |
-
# # GET MODBASE MODELS
|
| 11 |
-
# # Get IDs from data to retrieve only their models from MODBASE
|
| 12 |
-
# dataframe.reset_index(inplace=True, drop=True)
|
| 13 |
-
# existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*"))
|
| 14 |
-
# existing_modbase_models = [str(i) for i in existing_modbase_models]
|
| 15 |
-
# existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models]
|
| 16 |
-
|
| 17 |
-
# existing_modbase_models_ind = list(Path(path_to_output_files / 'modbase_structures_individual').glob("*"))
|
| 18 |
-
# existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind]
|
| 19 |
-
# existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind]
|
| 20 |
-
|
| 21 |
-
# modbase_reduced = pd.DataFrame(columns = ['uniprotID', 'target_begin', 'target_end', 'quality_score',
|
| 22 |
-
# 'model_id', 'coordinates','AAonPDB', 'coordVAR'])
|
| 23 |
-
# print('Retrieving ModBase models...\n')
|
| 24 |
-
# modbase = pd.DataFrame(
|
| 25 |
-
# columns=['uniprotID', 'target_begin', 'target_end', 'quality_score', 'model_id',
|
| 26 |
-
# 'coordinates', 'AAonPDB', 'coordVAR'])
|
| 27 |
-
# no_modbase = pd.DataFrame(
|
| 28 |
-
# columns=['uniprotID', 'target_begin', 'target_end', 'quality_score', 'model_id',
|
| 29 |
-
# 'coordinates', 'AAonPDB', 'coordVAR'])
|
| 30 |
-
# # Get model files associated with each UniProtID
|
| 31 |
-
# existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
|
| 32 |
-
# existing_free_sasa = [str(i) for i in existing_free_sasa]
|
| 33 |
-
# existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
| 34 |
-
# keep_cols = dataframe.columns
|
| 35 |
-
# for i in dataframe.index:
|
| 36 |
-
# coordDict = {}
|
| 37 |
-
# protein = dataframe.at[i, 'uniprotID']
|
| 38 |
-
# varPos = int(dataframe.at[i, 'pos'])
|
| 39 |
-
# wt = dataframe.at[i, 'wt']
|
| 40 |
-
# mut = dataframe.at[i, 'mut']
|
| 41 |
-
# datapoint = dataframe.at[i, 'datapoint']
|
| 42 |
-
|
| 43 |
-
# if protein not in existing_modbase_models:
|
| 44 |
-
# print('Downloading Modbase models for ', protein)
|
| 45 |
-
# url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
|
| 46 |
-
# req = requests.get(url)
|
| 47 |
-
# name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
|
| 48 |
-
# with open(name, 'wb') as f:
|
| 49 |
-
# f.write(req.content)
|
| 50 |
-
# else:
|
| 51 |
-
# print('Model exists for', protein)
|
| 52 |
-
# name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt')
|
| 53 |
-
|
| 54 |
-
# with open(name, encoding="utf8") as f:
|
| 55 |
-
# a = open(name, 'r').read()
|
| 56 |
-
# soup = BeautifulSoup(a, 'lxml')
|
| 57 |
-
# if soup.findAll('pdbfile') != []:
|
| 58 |
-
# for pdb in soup.findAll('pdbfile'):
|
| 59 |
-
# model_id = str(pdb.contents[1])[10:-11]
|
| 60 |
-
# if model_id not in existing_modbase_models_ind:
|
| 61 |
-
# with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', 'w', encoding="utf8") as individual:
|
| 62 |
-
# individual.write(str('UniProt ID: ' + protein))
|
| 63 |
-
# individual.write('\n')
|
| 64 |
-
# individual.write(str(pdb.contents[3])[10:-11].strip())
|
| 65 |
-
# run_freesasa(
|
| 66 |
-
# Path(path_to_output_files / 'modbase_structures_individual' / f'{model_id.lower()}.txt'),
|
| 67 |
-
# Path(path_to_output_files / 'freesasa_files' / f'{model_id.lower()}.txt'),
|
| 68 |
-
# include_hetatms=True,
|
| 69 |
-
# outdir=None, force_rerun=False, file_type='pdb')
|
| 70 |
-
# filename = Path(path_to_output_files / 'freesasa_files' / f'{model_id.lower()}.txt')
|
| 71 |
-
# st.write('filename', filename)
|
| 72 |
-
# st.write('varPos', varPos)
|
| 73 |
-
# st.write('wt', wt)
|
| 74 |
-
# st.write('protein', protein)
|
| 75 |
-
# st.write('path_to_output_files', path_to_output_files)
|
| 76 |
-
# dataframe.loc[i, 'sasa'] = sasa(protein, varPos, wt, 1, filename, path_to_output_files, file_type='pdb')
|
| 77 |
-
# st.write('sasa', dataframe.loc[i, 'sasa'] )
|
| 78 |
-
# st.write('model_id', model_id)
|
| 79 |
-
# with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt', encoding="utf8") as m:
|
| 80 |
-
|
| 81 |
-
# lines = m.readlines()
|
| 82 |
-
# quality_score = -999
|
| 83 |
-
# for ind_line in lines:
|
| 84 |
-
# if ind_line[0:10] == 'UniProt ID':
|
| 85 |
-
# uniprot_id = ind_line.split(':')[1].strip()
|
| 86 |
-
# if ind_line[0:23] == 'REMARK 220 TARGET BEGIN':
|
| 87 |
-
# target_begin = ind_line[40:43].strip()
|
| 88 |
-
# if ind_line[0:21] == 'REMARK 220 TARGET END':
|
| 89 |
-
# target_end = ind_line[40:43].strip()
|
| 90 |
-
# coordDict, AAonPDB, coordVAR = {},np.NaN,np.NaN
|
| 91 |
-
# if (int(varPos) > int(target_begin)) & (int(varPos) < int(target_end)):
|
| 92 |
-
# coordDict = {}
|
| 93 |
-
# for ind_line in lines:
|
| 94 |
-
# if ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID':
|
| 95 |
-
# model_id = ind_line[40:].strip()
|
| 96 |
-
# if ind_line[0:15].strip() == 'REMARK 220 MPQS':
|
| 97 |
-
# quality_score = ind_line[40:].strip()
|
| 98 |
-
# if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA':
|
| 99 |
-
# position = int(ind_line[22:26].strip())
|
| 100 |
-
# chain = ind_line[20:22].strip()
|
| 101 |
-
# aminoacid = threeToOne(ind_line[17:20])
|
| 102 |
-
# coords = [ind_line[31:38].strip(), ind_line[39:46].strip(), ind_line[47:54].strip()]
|
| 103 |
-
# coordDict[position] = coords
|
| 104 |
-
# if position == int(varPos):
|
| 105 |
-
# AAonPDB = aminoacid
|
| 106 |
-
# coordVAR = str(coords)
|
| 107 |
-
# if ind_line[0:3] == 'TER':
|
| 108 |
-
# break
|
| 109 |
-
# try:
|
| 110 |
-
# k = pd.Series(
|
| 111 |
-
# [uniprot_id, target_begin, target_end,quality_score, model_id, coordDict, AAonPDB, coordVAR])
|
| 112 |
-
# new_row = {'uniprotID': uniprot_id, 'target_begin': target_begin,
|
| 113 |
-
# 'target_end': target_end, 'quality_score': quality_score,
|
| 114 |
-
# 'model_id': model_id, 'coordinates': coordDict,
|
| 115 |
-
# 'AAonPDB': AAonPDB, 'coordVAR': coordVAR}
|
| 116 |
-
# modbase_reduced = modbase_reduced.append(new_row, ignore_index=True)
|
| 117 |
-
# modbase_reduced = modbase_reduced[['uniprotID', 'quality_score', 'model_id', 'coordinates', 'AAonPDB', 'coordVAR']]
|
| 118 |
-
# modbase = dataframe.merge(modbase_reduced, on='uniprotID', how='left')
|
| 119 |
-
# modbase.quality_score = modbase.quality_score.astype(float)
|
| 120 |
-
# modbase = modbase.sort_values(by=['datapoint', 'quality_score'], ascending=False)
|
| 121 |
-
# modbase.reset_index(inplace=True, drop=True)
|
| 122 |
-
# modbase.fillna(np.NaN, inplace=True)
|
| 123 |
-
# modbase.replace({'\'?\', ': '',
|
| 124 |
-
# ', \'?\'': '',
|
| 125 |
-
# '(': '', ')': '',
|
| 126 |
-
# '[\'?\']': np.NaN,
|
| 127 |
-
# '[]': np.NaN,
|
| 128 |
-
# 'nan-nan': np.NaN,
|
| 129 |
-
# '': np.NaN}, inplace=True)
|
| 130 |
-
# except NameError:
|
| 131 |
-
# print('This file doesnt have Quality Score. Replacer: -999', model_id)
|
| 132 |
-
# else:
|
| 133 |
-
# new_row = {'uniprotID': uniprot_id, 'wt': wt,
|
| 134 |
-
# 'pos': varPos, 'mut': mut, 'datapoint': datapoint }
|
| 135 |
-
# no_modbase = no_modbase.append(new_row, ignore_index=True)
|
| 136 |
-
|
| 137 |
-
# else:
|
| 138 |
-
# new_row = {'uniprotID': uniprot_id, 'wt': wt,
|
| 139 |
-
# 'pos': varPos, 'mut': mut, 'datapoint': datapoint }
|
| 140 |
-
# no_modbase = no_modbase.append(new_row, ignore_index=True)
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
# no_modbase_no_Coord = modbase[pd.isna(modbase['coordVAR'])]
|
| 145 |
-
# no_modbase = pd.concat([no_modbase, no_modbase_no_Coord])
|
| 146 |
-
# modbase = modbase[~pd.isna(modbase['coordVAR'])]
|
| 147 |
-
# no_modbase = no_modbase[keep_cols]
|
| 148 |
-
# return modbase, no_modbase
|
| 149 |
-
|
| 150 |
import requests
|
| 151 |
import numpy as np
|
| 152 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import requests
|
| 2 |
import numpy as np
|
| 3 |
import pandas as pd
|