Spaces:
Sleeping
Sleeping
Commit
·
8d9c11e
1
Parent(s):
debd6c0
Update code/pdb_featureVector.py
Browse files- code/pdb_featureVector.py +209 -167
code/pdb_featureVector.py
CHANGED
|
@@ -1,3 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# IMPORT NECESSARY MODULES AND LIBRARIES
|
| 2 |
from timeit import default_timer as timer
|
| 3 |
import xml.etree.ElementTree as ET
|
|
@@ -25,13 +75,13 @@ from Bio.PDB import PDBList
|
|
| 25 |
from Bio import Align
|
| 26 |
from Bio import SeqIO
|
| 27 |
from Bio.PDB import *
|
|
|
|
| 28 |
warnings.filterwarnings("ignore")
|
| 29 |
start = timer()
|
| 30 |
import streamlit as st
|
| 31 |
# FUNCTIONS
|
| 32 |
|
| 33 |
|
| 34 |
-
|
| 35 |
# FUNCTIONS
|
| 36 |
from calc_pc_property import *
|
| 37 |
from add_domains import *
|
|
@@ -57,14 +107,16 @@ def pdb(input_set, mode, impute):
|
|
| 57 |
Add datapoint identifier and remove non-standard input.
|
| 58 |
"""
|
| 59 |
data = clean_data(input_set)
|
| 60 |
-
path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer =
|
|
|
|
| 61 |
out_path = path_to_output_files / 'log.txt'
|
| 62 |
sys.stdout = open(out_path, 'w')
|
| 63 |
print('Creating directories...')
|
| 64 |
|
| 65 |
annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
| 66 |
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
|
| 67 |
-
'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
|
|
|
|
| 68 |
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
|
| 69 |
'transitPeptide', 'glycosylation', 'propeptide']
|
| 70 |
|
|
@@ -139,12 +191,14 @@ def pdb(input_set, mode, impute):
|
|
| 139 |
if wt == can:
|
| 140 |
data.at[i, 'wt_sequence_match'] = 'm'
|
| 141 |
elif wt != can:
|
| 142 |
-
isoList = isoform_fasta[
|
|
|
|
| 143 |
for k in isoList:
|
| 144 |
if len(k) >= int(data.at[i, 'pos']):
|
| 145 |
resInIso = k[int(int(data.at[i, 'pos']) - 1)]
|
| 146 |
if wt == resInIso:
|
| 147 |
-
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[
|
|
|
|
| 148 |
data.at[i, 'wt_sequence_match'] = 'i'
|
| 149 |
data.at[i, 'whichIsoform'] = whichIsoform
|
| 150 |
break
|
|
@@ -189,13 +243,13 @@ def pdb(input_set, mode, impute):
|
|
| 189 |
for prot in protein:
|
| 190 |
pdbs.append(get_pdb_ids(prot))
|
| 191 |
print('PDBs', pdbs)
|
| 192 |
-
if len(pdbs)>=1:
|
| 193 |
print('pdbs not empty')
|
| 194 |
pdbs = [item for sublist in pdbs for item in sublist]
|
| 195 |
print('NEW', pdbs)
|
| 196 |
else:
|
| 197 |
print('pdbs empty')
|
| 198 |
-
pdbs =[]
|
| 199 |
print('Processing PDB structures...\n')
|
| 200 |
if pdbs == []:
|
| 201 |
print('No PDB structure found for the query. ')
|
|
@@ -218,8 +272,8 @@ def pdb(input_set, mode, impute):
|
|
| 218 |
try:
|
| 219 |
shutil.rmtree('obsolete')
|
| 220 |
except OSError as e:
|
| 221 |
-
pass
|
| 222 |
-
existing_pdb = list(Path(path_to_output_files/'pdb_structures').glob("*"))
|
| 223 |
st.write('existing_pdb')
|
| 224 |
st.write(existing_pdb)
|
| 225 |
existing_pdb = [str(i) for i in existing_pdb]
|
|
@@ -229,28 +283,15 @@ def pdb(input_set, mode, impute):
|
|
| 229 |
for search in pdbs:
|
| 230 |
st.write('searching for pdb:', search)
|
| 231 |
try:
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
st.write('after download:', existing_pdb)
|
| 242 |
-
else:
|
| 243 |
-
print('PDB structure file exists..')
|
| 244 |
-
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
| 245 |
-
filename_replace_ext = filename.with_suffix(".pdb")
|
| 246 |
-
filename.rename(filename_replace_ext)
|
| 247 |
-
|
| 248 |
-
file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
|
| 249 |
-
|
| 250 |
-
base = os.path.splitext(str(file))[0]
|
| 251 |
-
base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
|
| 252 |
-
os.rename(file, base + ".ent")
|
| 253 |
-
file = base + '.ent'
|
| 254 |
|
| 255 |
resolution_method = parser.get_structure(search, file)
|
| 256 |
for record in SeqIO.parse(file, "pdb-seqres"):
|
|
@@ -269,7 +310,7 @@ def pdb(input_set, mode, impute):
|
|
| 269 |
pdb_info.at[index, 'pdbID'] = 'nan'
|
| 270 |
pdb_info.at[index, 'chain'] = 'nan'
|
| 271 |
pdb_info.at[index, 'resolution'] = 'nan'
|
| 272 |
-
cnt +=1
|
| 273 |
print()
|
| 274 |
print('PDB file processing finished..')
|
| 275 |
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
|
@@ -323,13 +364,11 @@ def pdb(input_set, mode, impute):
|
|
| 323 |
TypeError
|
| 324 |
with_pdb.at[i, 'pdbInfo'] = 'nan'
|
| 325 |
|
| 326 |
-
with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
|
| 327 |
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
|
| 328 |
'wt_sequence_match',
|
| 329 |
'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
|
| 330 |
|
| 331 |
-
|
| 332 |
-
|
| 333 |
# If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
|
| 334 |
# If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
|
| 335 |
# If the query data points are found in with_pdb data frame, it will be searched in the following steps.
|
|
@@ -343,7 +382,8 @@ def pdb(input_set, mode, impute):
|
|
| 343 |
if len(with_pdb) > 0:
|
| 344 |
with_pdb = add_annotations(with_pdb)
|
| 345 |
else:
|
| 346 |
-
new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant',
|
|
|
|
| 347 |
'activeSite',
|
| 348 |
'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
|
| 349 |
'crosslink', 'mutagenesis', 'strand',
|
|
@@ -362,7 +402,7 @@ def pdb(input_set, mode, impute):
|
|
| 362 |
'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
|
| 363 |
'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
|
| 364 |
'glycosylationBinary', 'propeptideBinary']
|
| 365 |
-
with_pdb = pd.DataFrame(columns
|
| 366 |
try:
|
| 367 |
with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
|
| 368 |
except:
|
|
@@ -374,7 +414,7 @@ def pdb(input_set, mode, impute):
|
|
| 374 |
with_pdb.replace({'[]': 'nan'}, inplace=True)
|
| 375 |
with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
|
| 376 |
with_pdb.replace({'': 'nan'}, inplace=True)
|
| 377 |
-
|
| 378 |
"""
|
| 379 |
STEP 7
|
| 380 |
Do alignment for PDB
|
|
@@ -409,8 +449,7 @@ def pdb(input_set, mode, impute):
|
|
| 409 |
existing_pdb = None
|
| 410 |
with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
|
| 411 |
with_pdb = None
|
| 412 |
-
|
| 413 |
-
|
| 414 |
print('Aligning sequences...\n')
|
| 415 |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
| 416 |
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
|
@@ -433,7 +472,6 @@ def pdb(input_set, mode, impute):
|
|
| 433 |
aligned_m = aligned_m.astype(str)
|
| 434 |
aligned_nm = aligned_nm.astype(str)
|
| 435 |
|
| 436 |
-
|
| 437 |
frames = [aligned_m, aligned_nm]
|
| 438 |
after_up_pdb_alignment = pd.concat(frames, sort=False)
|
| 439 |
if len(after_up_pdb_alignment) == 0:
|
|
@@ -456,7 +494,6 @@ def pdb(input_set, mode, impute):
|
|
| 456 |
(after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
|
| 457 |
no_pdb = no_pdb.copy()
|
| 458 |
|
| 459 |
-
|
| 460 |
print('PDB matching is completed...\n')
|
| 461 |
print('SUMMARY')
|
| 462 |
print('-------')
|
|
@@ -471,7 +508,6 @@ def pdb(input_set, mode, impute):
|
|
| 471 |
print('--%d will be searched in Swiss-Model database.\n' % (
|
| 472 |
len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
|
| 473 |
|
| 474 |
-
|
| 475 |
dfM = None
|
| 476 |
dfNM = None
|
| 477 |
aligned_nm = None
|
|
@@ -527,7 +563,8 @@ def pdb(input_set, mode, impute):
|
|
| 527 |
swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
|
| 528 |
dtype=str, header=None, skiprows=1,
|
| 529 |
names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
|
| 530 |
-
'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean',
|
|
|
|
| 531 |
|
| 532 |
else:
|
| 533 |
swiss_model = pd.DataFrame(
|
|
@@ -547,13 +584,13 @@ def pdb(input_set, mode, impute):
|
|
| 547 |
swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
|
| 548 |
else:
|
| 549 |
swiss_model.at[ind, 'whichIsoform'] = 'nan'
|
| 550 |
-
# swiss_model.drop(['input'], axis=1, inplace=True)
|
| 551 |
swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
|
| 552 |
print('Index File Processed...\n')
|
| 553 |
|
| 554 |
-
|
| 555 |
# Get relevant columns
|
| 556 |
-
swiss_model = swiss_model[
|
|
|
|
| 557 |
# Sort models on qmean score and identity. Some proteins have more than one models, we will pick one.
|
| 558 |
swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
|
| 559 |
swiss_model.reset_index(inplace=True)
|
|
@@ -710,7 +747,6 @@ def pdb(input_set, mode, impute):
|
|
| 710 |
ascending=[True, False])
|
| 711 |
swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
|
| 712 |
|
| 713 |
-
|
| 714 |
swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
|
| 715 |
swiss_models_with_data.reset_index(inplace=True)
|
| 716 |
swiss_models_with_data.drop(['index'], axis=1, inplace=True)
|
|
@@ -727,7 +763,6 @@ def pdb(input_set, mode, impute):
|
|
| 727 |
|
| 728 |
swiss_models_with_data = swiss_models_with_data1.copy()
|
| 729 |
|
| 730 |
-
|
| 731 |
swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
|
| 732 |
swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
|
| 733 |
axis=0, ascending=[True, True, True, False])
|
|
@@ -737,7 +772,8 @@ def pdb(input_set, mode, impute):
|
|
| 737 |
keep='first')
|
| 738 |
swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
|
| 739 |
swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
|
| 740 |
-
len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(
|
|
|
|
| 741 |
no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
|
| 742 |
# This printed data here includes all possible models with different qualities,
|
| 743 |
# because we may get a hit in either of them.
|
|
@@ -764,10 +800,10 @@ def pdb(input_set, mode, impute):
|
|
| 764 |
|
| 765 |
swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
|
| 766 |
swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
|
| 767 |
-
swiss_model_aligned = alignment(swiss_models_with_data, annotation_list,
|
|
|
|
| 768 |
swiss_models_with_data = None
|
| 769 |
|
| 770 |
-
|
| 771 |
if len(swiss_model_aligned) == 0:
|
| 772 |
swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
|
| 773 |
swiss_model_aligned['qmean_norm'] = 'nan'
|
|
@@ -860,7 +896,7 @@ def pdb(input_set, mode, impute):
|
|
| 860 |
url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
|
| 861 |
print(url)
|
| 862 |
req = requests.get(url)
|
| 863 |
-
name = path_to_output_files / 'modbase_structures' /
|
| 864 |
with open(name, 'wb') as f:
|
| 865 |
f.write(req.content)
|
| 866 |
else:
|
|
@@ -877,7 +913,7 @@ def pdb(input_set, mode, impute):
|
|
| 877 |
individual.write(str('UniProt ID: ' + protein))
|
| 878 |
individual.write('\n')
|
| 879 |
individual.write(str(pdb.contents[3])[10:-11].strip())
|
| 880 |
-
with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt',
|
| 881 |
encoding="utf8") as f:
|
| 882 |
fasta = ''
|
| 883 |
chain = ''
|
|
@@ -960,7 +996,6 @@ def pdb(input_set, mode, impute):
|
|
| 960 |
existing_modbase_models = None
|
| 961 |
existing_modbase_models_ind = None
|
| 962 |
|
| 963 |
-
|
| 964 |
model_info_added = model_info_added.drop(['UniprotID'], axis=1)
|
| 965 |
model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
|
| 966 |
'PDBCode': 'template', 'PDBChain': 'chain',
|
|
@@ -1013,7 +1048,8 @@ def pdb(input_set, mode, impute):
|
|
| 1013 |
with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
|
| 1014 |
axis=0,
|
| 1015 |
ascending=[True, True, True, True, False, True, False])
|
| 1016 |
-
with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'],
|
|
|
|
| 1017 |
|
| 1018 |
with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
|
| 1019 |
with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
|
|
@@ -1027,7 +1063,6 @@ def pdb(input_set, mode, impute):
|
|
| 1027 |
with_modbase_info.reset_index(inplace=True)
|
| 1028 |
with_modbase_info.drop('index', axis=1, inplace=True)
|
| 1029 |
|
| 1030 |
-
|
| 1031 |
align = with_modbase_info[
|
| 1032 |
with_modbase_info.fasta != 'nan']
|
| 1033 |
yes_pdb_no_match = with_modbase_info[
|
|
@@ -1046,7 +1081,6 @@ def pdb(input_set, mode, impute):
|
|
| 1046 |
modbase_aligned = modbase_aligned.astype(str)
|
| 1047 |
modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
|
| 1048 |
|
| 1049 |
-
|
| 1050 |
# Get the ones whose models couldn't be found. Add to no_modbase (yani hiçbir şey de eşleşmemiş artık.)
|
| 1051 |
if len(with_modbase_info) != 0:
|
| 1052 |
not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
|
|
@@ -1054,29 +1088,30 @@ def pdb(input_set, mode, impute):
|
|
| 1054 |
['datapoint'],
|
| 1055 |
keep=False)
|
| 1056 |
else:
|
| 1057 |
-
not_in_aligned = pd.DataFrame(
|
| 1058 |
-
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
-
|
| 1064 |
-
|
| 1065 |
-
|
| 1066 |
-
|
| 1067 |
-
|
| 1068 |
-
|
| 1069 |
-
|
| 1070 |
-
|
| 1071 |
-
|
| 1072 |
-
|
| 1073 |
-
|
| 1074 |
-
|
| 1075 |
-
|
| 1076 |
-
|
| 1077 |
-
|
| 1078 |
-
|
| 1079 |
-
|
|
|
|
| 1080 |
with_modbase_info = None
|
| 1081 |
if len(not_in_aligned) != 0:
|
| 1082 |
not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
|
|
@@ -1093,7 +1128,8 @@ def pdb(input_set, mode, impute):
|
|
| 1093 |
nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
|
| 1094 |
not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
|
| 1095 |
not_nan.score = not_nan.score.astype(float)
|
| 1096 |
-
not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False],
|
|
|
|
| 1097 |
|
| 1098 |
not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
|
| 1099 |
ascending=[True, True, False])
|
|
@@ -1105,7 +1141,7 @@ def pdb(input_set, mode, impute):
|
|
| 1105 |
which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
|
| 1106 |
if len(which_ones_are_match) == 0:
|
| 1107 |
which_ones_are_match = pd.DataFrame(
|
| 1108 |
-
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
|
| 1109 |
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
| 1110 |
'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
|
| 1111 |
'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
|
@@ -1141,7 +1177,6 @@ def pdb(input_set, mode, impute):
|
|
| 1141 |
not_nan = None
|
| 1142 |
nan = None
|
| 1143 |
|
| 1144 |
-
|
| 1145 |
# merge not_in_align and modbase_not_match as they were both excluded from modbase match.
|
| 1146 |
|
| 1147 |
# No model
|
|
@@ -1170,9 +1205,10 @@ def pdb(input_set, mode, impute):
|
|
| 1170 |
elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
|
| 1171 |
rest = no_info
|
| 1172 |
else:
|
| 1173 |
-
rest = pd.DataFrame(
|
| 1174 |
-
|
| 1175 |
-
|
|
|
|
| 1176 |
|
| 1177 |
rest = rest[to_swiss_columns]
|
| 1178 |
rest = rest.drop_duplicates()
|
|
@@ -1184,49 +1220,53 @@ def pdb(input_set, mode, impute):
|
|
| 1184 |
|
| 1185 |
else:
|
| 1186 |
|
| 1187 |
-
modbase_match = pd.DataFrame(
|
| 1188 |
-
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
-
|
| 1194 |
-
|
| 1195 |
-
|
| 1196 |
-
|
| 1197 |
-
|
| 1198 |
-
|
| 1199 |
-
|
| 1200 |
-
|
| 1201 |
-
|
| 1202 |
-
|
| 1203 |
-
|
| 1204 |
-
|
| 1205 |
-
|
| 1206 |
-
|
| 1207 |
-
|
| 1208 |
-
|
| 1209 |
-
|
| 1210 |
-
|
| 1211 |
-
|
| 1212 |
-
|
| 1213 |
-
|
| 1214 |
-
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
-
|
| 1219 |
-
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
-
|
| 1225 |
-
|
| 1226 |
-
|
| 1227 |
-
|
| 1228 |
-
|
| 1229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1230 |
|
| 1231 |
rest = rest[to_swiss_columns]
|
| 1232 |
rest = rest.drop_duplicates()
|
|
@@ -1262,7 +1302,6 @@ def pdb(input_set, mode, impute):
|
|
| 1262 |
not_models = None
|
| 1263 |
modbase_not_match = None
|
| 1264 |
|
| 1265 |
-
|
| 1266 |
# Final corrections
|
| 1267 |
|
| 1268 |
# Now 3D alignment.
|
|
@@ -1284,7 +1323,6 @@ def pdb(input_set, mode, impute):
|
|
| 1284 |
|
| 1285 |
# Fix the axes and merge all data.
|
| 1286 |
|
| 1287 |
-
|
| 1288 |
pdb.drop(['pdbInfo'], axis=1, inplace=True)
|
| 1289 |
pdb.rename(columns={'resolution': 'score'}, inplace=True)
|
| 1290 |
swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
|
|
@@ -1297,7 +1335,6 @@ def pdb(input_set, mode, impute):
|
|
| 1297 |
modbase['source'] = 'MODBASE'
|
| 1298 |
data = pd.concat([swiss, modbase, pdb])
|
| 1299 |
|
| 1300 |
-
|
| 1301 |
data.reset_index(inplace=True)
|
| 1302 |
data.drop(['index'], axis=1, inplace=True)
|
| 1303 |
data = data.astype('str')
|
|
@@ -1321,10 +1358,10 @@ def pdb(input_set, mode, impute):
|
|
| 1321 |
for pdbID in pdb_only.pdbID.to_list():
|
| 1322 |
if pdbID not in existing_free_sasa:
|
| 1323 |
(run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
|
| 1324 |
-
Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
|
|
|
|
| 1325 |
outdir=None, force_rerun=False, file_type='pdb'))
|
| 1326 |
|
| 1327 |
-
|
| 1328 |
print('Calculation RSA for SwissModel Files...\n')
|
| 1329 |
swiss_only = data[data.source == 'SWISSMODEL']
|
| 1330 |
swiss_dp = []
|
|
@@ -1342,7 +1379,8 @@ def pdb(input_set, mode, impute):
|
|
| 1342 |
for pdbID in modbase_only.pdbID.to_list():
|
| 1343 |
if pdbID not in existing_free_sasa:
|
| 1344 |
(run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
|
| 1345 |
-
Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
|
|
|
|
| 1346 |
outdir=None, force_rerun=False, file_type='pdb'))
|
| 1347 |
|
| 1348 |
# This annotation list is different than the prev one, keep it.
|
|
@@ -1380,16 +1418,18 @@ def pdb(input_set, mode, impute):
|
|
| 1380 |
chain = data.at[i, 'chain']
|
| 1381 |
uniprotID = data.at[i, 'uniprotID']
|
| 1382 |
pdbID = data.at[i, 'pdbID']
|
| 1383 |
-
alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode,
|
|
|
|
| 1384 |
mutPos = data.at[i, 'mutationPositionOnPDB']
|
| 1385 |
try:
|
| 1386 |
-
coordMut = get_coords(mutPos, alignments
|
| 1387 |
except:
|
| 1388 |
ValueError
|
| 1389 |
coordMut = 'nan'
|
| 1390 |
try:
|
| 1391 |
sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
|
| 1392 |
-
data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
|
|
|
|
| 1393 |
except:
|
| 1394 |
ValueError
|
| 1395 |
data.at[i, 'sasa'] = 'nan' # mutation position is nan
|
|
@@ -1437,11 +1477,9 @@ def pdb(input_set, mode, impute):
|
|
| 1437 |
data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
|
| 1438 |
float(data.at[i, 'domainEndonPDB']))
|
| 1439 |
|
| 1440 |
-
|
| 1441 |
data = data.astype(str)
|
| 1442 |
data.replace({'NaN': 'nan'}, inplace=True)
|
| 1443 |
|
| 1444 |
-
|
| 1445 |
# Now unify all 3 separate data. We have with_pdb. The ones that have pdb structyres, swiss, modbase, the ones didnt match with ant and the ones didnt have wt seq match.
|
| 1446 |
|
| 1447 |
# Get interface positions from ECLAIR. Download HQ human
|
|
@@ -1462,28 +1500,29 @@ def pdb(input_set, mode, impute):
|
|
| 1462 |
interface_dataframe.columns = ['uniprotID', 'positions']
|
| 1463 |
|
| 1464 |
if len(data) == 0:
|
| 1465 |
-
data = pd.DataFrame(
|
| 1466 |
-
|
| 1467 |
-
|
| 1468 |
-
|
| 1469 |
-
|
| 1470 |
-
|
| 1471 |
-
|
| 1472 |
-
|
| 1473 |
-
|
| 1474 |
-
|
| 1475 |
-
|
| 1476 |
-
|
| 1477 |
-
|
| 1478 |
-
|
| 1479 |
-
|
| 1480 |
-
|
| 1481 |
-
|
| 1482 |
-
|
| 1483 |
-
|
| 1484 |
-
|
| 1485 |
-
|
| 1486 |
-
|
|
|
|
| 1487 |
else:
|
| 1488 |
data.sasa = data.sasa.astype('str')
|
| 1489 |
|
|
@@ -1522,7 +1561,6 @@ def pdb(input_set, mode, impute):
|
|
| 1522 |
|
| 1523 |
data.drop(['positions'], axis=1, inplace=True)
|
| 1524 |
|
| 1525 |
-
|
| 1526 |
# OPTIONAL
|
| 1527 |
# DOMAIN SELECTION
|
| 1528 |
# Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
|
|
@@ -1541,7 +1579,8 @@ def pdb(input_set, mode, impute):
|
|
| 1541 |
# nan--> 0, 0 -->1 and 1 -->2
|
| 1542 |
|
| 1543 |
print('Final adjustments are being done...\n')
|
| 1544 |
-
binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary',
|
|
|
|
| 1545 |
'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
|
| 1546 |
'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
|
| 1547 |
'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
|
@@ -1643,7 +1682,8 @@ def pdb(input_set, mode, impute):
|
|
| 1643 |
ready = data.copy()
|
| 1644 |
# Imputation
|
| 1645 |
if (impute == 'True') or (impute == 'true') or (impute == True):
|
| 1646 |
-
filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99,
|
|
|
|
| 1647 |
20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
|
| 1648 |
col_index = 0
|
| 1649 |
for col_ in ready.columns[-30:]:
|
|
@@ -1658,7 +1698,8 @@ def pdb(input_set, mode, impute):
|
|
| 1658 |
ready = ready.replace({'nan': np.NaN})
|
| 1659 |
ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
|
| 1660 |
if len(ready) == 0:
|
| 1661 |
-
print(
|
|
|
|
| 1662 |
print(ready)
|
| 1663 |
print('Feature vector successfully created...')
|
| 1664 |
return ready
|
|
@@ -1669,3 +1710,4 @@ def pdb(input_set, mode, impute):
|
|
| 1669 |
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
| 1670 |
sys.stdout.close()
|
| 1671 |
return ready
|
|
|
|
|
|
| 1 |
+
Hugging
|
| 2 |
+
Face
|
| 3 |
+
's logo
|
| 4 |
+
Hugging
|
| 5 |
+
Face
|
| 6 |
+
Search
|
| 7 |
+
models, datasets, users...
|
| 8 |
+
Models
|
| 9 |
+
Datasets
|
| 10 |
+
Spaces
|
| 11 |
+
Docs
|
| 12 |
+
Solutions
|
| 13 |
+
Pricing
|
| 14 |
+
|
| 15 |
+
Spaces:
|
| 16 |
+
|
| 17 |
+
HUBioDataLab
|
| 18 |
+
/
|
| 19 |
+
ASCARIS
|
| 20 |
+
|
| 21 |
+
like
|
| 22 |
+
0
|
| 23 |
+
|
| 24 |
+
App
|
| 25 |
+
Files
|
| 26 |
+
Community
|
| 27 |
+
Settings
|
| 28 |
+
ASCARIS
|
| 29 |
+
/
|
| 30 |
+
code
|
| 31 |
+
/
|
| 32 |
+
pdb_featureVector.py
|
| 33 |
+
fatmacankara
|
| 34 |
+
's picture
|
| 35 |
+
fatmacankara
|
| 36 |
+
Update
|
| 37 |
+
code / pdb_featureVector.py
|
| 38 |
+
debd6c0
|
| 39 |
+
less
|
| 40 |
+
than
|
| 41 |
+
a
|
| 42 |
+
minute
|
| 43 |
+
ago
|
| 44 |
+
raw
|
| 45 |
+
history
|
| 46 |
+
blame
|
| 47 |
+
edit
|
| 48 |
+
delete
|
| 49 |
+
96
|
| 50 |
+
kB
|
| 51 |
# IMPORT NECESSARY MODULES AND LIBRARIES
|
| 52 |
from timeit import default_timer as timer
|
| 53 |
import xml.etree.ElementTree as ET
|
|
|
|
| 75 |
from Bio import Align
|
| 76 |
from Bio import SeqIO
|
| 77 |
from Bio.PDB import *
|
| 78 |
+
|
| 79 |
warnings.filterwarnings("ignore")
|
| 80 |
start = timer()
|
| 81 |
import streamlit as st
|
| 82 |
# FUNCTIONS
|
| 83 |
|
| 84 |
|
|
|
|
| 85 |
# FUNCTIONS
|
| 86 |
from calc_pc_property import *
|
| 87 |
from add_domains import *
|
|
|
|
| 107 |
Add datapoint identifier and remove non-standard input.
|
| 108 |
"""
|
| 109 |
data = clean_data(input_set)
|
| 110 |
+
path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(
|
| 111 |
+
mode)
|
| 112 |
out_path = path_to_output_files / 'log.txt'
|
| 113 |
sys.stdout = open(out_path, 'w')
|
| 114 |
print('Creating directories...')
|
| 115 |
|
| 116 |
annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
| 117 |
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
|
| 118 |
+
'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
|
| 119 |
+
'region',
|
| 120 |
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
|
| 121 |
'transitPeptide', 'glycosylation', 'propeptide']
|
| 122 |
|
|
|
|
| 191 |
if wt == can:
|
| 192 |
data.at[i, 'wt_sequence_match'] = 'm'
|
| 193 |
elif wt != can:
|
| 194 |
+
isoList = isoform_fasta[
|
| 195 |
+
isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
|
| 196 |
for k in isoList:
|
| 197 |
if len(k) >= int(data.at[i, 'pos']):
|
| 198 |
resInIso = k[int(int(data.at[i, 'pos']) - 1)]
|
| 199 |
if wt == resInIso:
|
| 200 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[
|
| 201 |
+
0]
|
| 202 |
data.at[i, 'wt_sequence_match'] = 'i'
|
| 203 |
data.at[i, 'whichIsoform'] = whichIsoform
|
| 204 |
break
|
|
|
|
| 243 |
for prot in protein:
|
| 244 |
pdbs.append(get_pdb_ids(prot))
|
| 245 |
print('PDBs', pdbs)
|
| 246 |
+
if len(pdbs) >= 1:
|
| 247 |
print('pdbs not empty')
|
| 248 |
pdbs = [item for sublist in pdbs for item in sublist]
|
| 249 |
print('NEW', pdbs)
|
| 250 |
else:
|
| 251 |
print('pdbs empty')
|
| 252 |
+
pdbs = []
|
| 253 |
print('Processing PDB structures...\n')
|
| 254 |
if pdbs == []:
|
| 255 |
print('No PDB structure found for the query. ')
|
|
|
|
| 272 |
try:
|
| 273 |
shutil.rmtree('obsolete')
|
| 274 |
except OSError as e:
|
| 275 |
+
pass
|
| 276 |
+
existing_pdb = list(Path(path_to_output_files / 'pdb_structures').glob("*"))
|
| 277 |
st.write('existing_pdb')
|
| 278 |
st.write(existing_pdb)
|
| 279 |
existing_pdb = [str(i) for i in existing_pdb]
|
|
|
|
| 283 |
for search in pdbs:
|
| 284 |
st.write('searching for pdb:', search)
|
| 285 |
try:
|
| 286 |
+
path_pdb = 'out_files/pdb/pdb_structures'
|
| 287 |
+
st.write('path for pdb: ', path_pdb)
|
| 288 |
+
file = pdbl.retrieve_pdb_file(search, pdir=path_pdb, file_format="pdb")
|
| 289 |
+
st.write('file: ', file)
|
| 290 |
+
existing_pdb = list(Path(path_to_output_files / 'pdb_structures').glob("*"))
|
| 291 |
+
st.write('after download:', existing_pdb)
|
| 292 |
+
st.write(Path(path_to_output_files / 'pdb_structures') , path_pdb)
|
| 293 |
+
existing_pdb = list(path_pdb.glob("*"))
|
| 294 |
+
st.write('after download:', existing_pdb)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
resolution_method = parser.get_structure(search, file)
|
| 297 |
for record in SeqIO.parse(file, "pdb-seqres"):
|
|
|
|
| 310 |
pdb_info.at[index, 'pdbID'] = 'nan'
|
| 311 |
pdb_info.at[index, 'chain'] = 'nan'
|
| 312 |
pdb_info.at[index, 'resolution'] = 'nan'
|
| 313 |
+
cnt += 1
|
| 314 |
print()
|
| 315 |
print('PDB file processing finished..')
|
| 316 |
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
|
|
|
| 364 |
TypeError
|
| 365 |
with_pdb.at[i, 'pdbInfo'] = 'nan'
|
| 366 |
|
| 367 |
+
with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
| 368 |
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
|
| 369 |
'wt_sequence_match',
|
| 370 |
'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
|
| 371 |
|
|
|
|
|
|
|
| 372 |
# If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
|
| 373 |
# If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
|
| 374 |
# If the query data points are found in with_pdb data frame, it will be searched in the following steps.
|
|
|
|
| 382 |
if len(with_pdb) > 0:
|
| 383 |
with_pdb = add_annotations(with_pdb)
|
| 384 |
else:
|
| 385 |
+
new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant',
|
| 386 |
+
'dnaBinding',
|
| 387 |
'activeSite',
|
| 388 |
'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
|
| 389 |
'crosslink', 'mutagenesis', 'strand',
|
|
|
|
| 402 |
'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
|
| 403 |
'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
|
| 404 |
'glycosylationBinary', 'propeptideBinary']
|
| 405 |
+
with_pdb = pd.DataFrame(columns=new_cols)
|
| 406 |
try:
|
| 407 |
with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
|
| 408 |
except:
|
|
|
|
| 414 |
with_pdb.replace({'[]': 'nan'}, inplace=True)
|
| 415 |
with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
|
| 416 |
with_pdb.replace({'': 'nan'}, inplace=True)
|
| 417 |
+
|
| 418 |
"""
|
| 419 |
STEP 7
|
| 420 |
Do alignment for PDB
|
|
|
|
| 449 |
existing_pdb = None
|
| 450 |
with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
|
| 451 |
with_pdb = None
|
| 452 |
+
|
|
|
|
| 453 |
print('Aligning sequences...\n')
|
| 454 |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
| 455 |
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
|
|
|
| 472 |
aligned_m = aligned_m.astype(str)
|
| 473 |
aligned_nm = aligned_nm.astype(str)
|
| 474 |
|
|
|
|
| 475 |
frames = [aligned_m, aligned_nm]
|
| 476 |
after_up_pdb_alignment = pd.concat(frames, sort=False)
|
| 477 |
if len(after_up_pdb_alignment) == 0:
|
|
|
|
| 494 |
(after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
|
| 495 |
no_pdb = no_pdb.copy()
|
| 496 |
|
|
|
|
| 497 |
print('PDB matching is completed...\n')
|
| 498 |
print('SUMMARY')
|
| 499 |
print('-------')
|
|
|
|
| 508 |
print('--%d will be searched in Swiss-Model database.\n' % (
|
| 509 |
len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
|
| 510 |
|
|
|
|
| 511 |
dfM = None
|
| 512 |
dfNM = None
|
| 513 |
aligned_nm = None
|
|
|
|
| 563 |
swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
|
| 564 |
dtype=str, header=None, skiprows=1,
|
| 565 |
names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
|
| 566 |
+
'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean',
|
| 567 |
+
'qmean_norm', 'seqid', 'url'])
|
| 568 |
|
| 569 |
else:
|
| 570 |
swiss_model = pd.DataFrame(
|
|
|
|
| 584 |
swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
|
| 585 |
else:
|
| 586 |
swiss_model.at[ind, 'whichIsoform'] = 'nan'
|
| 587 |
+
# swiss_model.drop(['input'], axis=1, inplace=True)
|
| 588 |
swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
|
| 589 |
print('Index File Processed...\n')
|
| 590 |
|
|
|
|
| 591 |
# Get relevant columns
|
| 592 |
+
swiss_model = swiss_model[
|
| 593 |
+
['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
|
| 594 |
# Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
|
| 595 |
swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
|
| 596 |
swiss_model.reset_index(inplace=True)
|
|
|
|
| 747 |
ascending=[True, False])
|
| 748 |
swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
|
| 749 |
|
|
|
|
| 750 |
swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
|
| 751 |
swiss_models_with_data.reset_index(inplace=True)
|
| 752 |
swiss_models_with_data.drop(['index'], axis=1, inplace=True)
|
|
|
|
| 763 |
|
| 764 |
swiss_models_with_data = swiss_models_with_data1.copy()
|
| 765 |
|
|
|
|
| 766 |
swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
|
| 767 |
swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
|
| 768 |
axis=0, ascending=[True, True, True, False])
|
|
|
|
| 772 |
keep='first')
|
| 773 |
swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
|
| 774 |
swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
|
| 775 |
+
len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(
|
| 776 |
+
broken_swiss.drop_duplicates(['datapoint'])) + len(
|
| 777 |
no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
|
| 778 |
# This printed data here includes all possible models with different qualities,
|
| 779 |
# because we may get a hit in either of them.
|
|
|
|
| 800 |
|
| 801 |
swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
|
| 802 |
swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
|
| 803 |
+
swiss_model_aligned = alignment(swiss_models_with_data, annotation_list,
|
| 804 |
+
path_to_output_files / 'alignment_files')
|
| 805 |
swiss_models_with_data = None
|
| 806 |
|
|
|
|
| 807 |
if len(swiss_model_aligned) == 0:
|
| 808 |
swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
|
| 809 |
swiss_model_aligned['qmean_norm'] = 'nan'
|
|
|
|
| 896 |
url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
|
| 897 |
print(url)
|
| 898 |
req = requests.get(url)
|
| 899 |
+
name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
|
| 900 |
with open(name, 'wb') as f:
|
| 901 |
f.write(req.content)
|
| 902 |
else:
|
|
|
|
| 913 |
individual.write(str('UniProt ID: ' + protein))
|
| 914 |
individual.write('\n')
|
| 915 |
individual.write(str(pdb.contents[3])[10:-11].strip())
|
| 916 |
+
with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt',
|
| 917 |
encoding="utf8") as f:
|
| 918 |
fasta = ''
|
| 919 |
chain = ''
|
|
|
|
| 996 |
existing_modbase_models = None
|
| 997 |
existing_modbase_models_ind = None
|
| 998 |
|
|
|
|
| 999 |
model_info_added = model_info_added.drop(['UniprotID'], axis=1)
|
| 1000 |
model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
|
| 1001 |
'PDBCode': 'template', 'PDBChain': 'chain',
|
|
|
|
| 1048 |
with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
|
| 1049 |
axis=0,
|
| 1050 |
ascending=[True, True, True, True, False, True, False])
|
| 1051 |
+
with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'],
|
| 1052 |
+
keep='first')
|
| 1053 |
|
| 1054 |
with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
|
| 1055 |
with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
|
|
|
|
| 1063 |
with_modbase_info.reset_index(inplace=True)
|
| 1064 |
with_modbase_info.drop('index', axis=1, inplace=True)
|
| 1065 |
|
|
|
|
| 1066 |
align = with_modbase_info[
|
| 1067 |
with_modbase_info.fasta != 'nan']
|
| 1068 |
yes_pdb_no_match = with_modbase_info[
|
|
|
|
| 1081 |
modbase_aligned = modbase_aligned.astype(str)
|
| 1082 |
modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
|
| 1083 |
|
|
|
|
| 1084 |
# Get the ones whose models couldn't be found. Add to no_modbase (i.e. nothing has matched for them at this point).
|
| 1085 |
if len(with_modbase_info) != 0:
|
| 1086 |
not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
|
|
|
|
| 1088 |
['datapoint'],
|
| 1089 |
keep=False)
|
| 1090 |
else:
|
| 1091 |
+
not_in_aligned = pd.DataFrame(
|
| 1092 |
+
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
| 1093 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
| 1094 |
+
'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
|
| 1095 |
+
'intMet',
|
| 1096 |
+
'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
| 1097 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
|
| 1098 |
+
'crosslink',
|
| 1099 |
+
'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
|
| 1100 |
+
'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
| 1101 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
|
| 1102 |
+
'coiledCoil',
|
| 1103 |
+
'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
|
| 1104 |
+
'disulfide',
|
| 1105 |
+
'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
|
| 1106 |
+
'activeSite',
|
| 1107 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
|
| 1108 |
+
'crosslink',
|
| 1109 |
+
'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
|
| 1110 |
+
'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
| 1111 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
|
| 1112 |
+
'coiledCoil',
|
| 1113 |
+
'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
|
| 1114 |
+
'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
|
| 1115 |
with_modbase_info = None
|
| 1116 |
if len(not_in_aligned) != 0:
|
| 1117 |
not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
|
|
|
|
| 1128 |
nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
|
| 1129 |
not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
|
| 1130 |
not_nan.score = not_nan.score.astype(float)
|
| 1131 |
+
not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False],
|
| 1132 |
+
inplace=True)
|
| 1133 |
|
| 1134 |
not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
|
| 1135 |
ascending=[True, True, False])
|
|
|
|
| 1141 |
which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
|
| 1142 |
if len(which_ones_are_match) == 0:
|
| 1143 |
which_ones_are_match = pd.DataFrame(
|
| 1144 |
+
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
| 1145 |
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
| 1146 |
'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
|
| 1147 |
'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
|
|
|
| 1177 |
not_nan = None
|
| 1178 |
nan = None
|
| 1179 |
|
|
|
|
| 1180 |
# Merge not_in_aligned and modbase_not_match as they were both excluded from modbase match.
|
| 1181 |
|
| 1182 |
# No model
|
|
|
|
| 1205 |
elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
|
| 1206 |
rest = no_info
|
| 1207 |
else:
|
| 1208 |
+
rest = pd.DataFrame(
|
| 1209 |
+
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
| 1210 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
| 1211 |
+
'wt_sequence_match', 'whichIsoform', 'datapoint'])
|
| 1212 |
|
| 1213 |
rest = rest[to_swiss_columns]
|
| 1214 |
rest = rest.drop_duplicates()
|
|
|
|
| 1220 |
|
| 1221 |
else:
|
| 1222 |
|
| 1223 |
+
modbase_match = pd.DataFrame(
|
| 1224 |
+
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
| 1225 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
| 1226 |
+
'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
|
| 1227 |
+
'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
| 1228 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
|
| 1229 |
+
'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
|
| 1230 |
+
'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
| 1231 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
|
| 1232 |
+
'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
|
| 1233 |
+
'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
|
| 1234 |
+
'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
|
| 1235 |
+
'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
|
| 1236 |
+
'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
|
| 1237 |
+
'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
| 1238 |
+
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
|
| 1239 |
+
'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
|
| 1240 |
+
'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
|
| 1241 |
+
'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
|
| 1242 |
+
'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
|
| 1243 |
+
'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
|
| 1244 |
+
'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
|
| 1245 |
+
not_in_aligned = pd.DataFrame(
|
| 1246 |
+
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
| 1247 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
| 1248 |
+
'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
|
| 1249 |
+
'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
| 1250 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
|
| 1251 |
+
'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
|
| 1252 |
+
'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
| 1253 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
|
| 1254 |
+
'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
|
| 1255 |
+
'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
| 1256 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
|
| 1257 |
+
'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
|
| 1258 |
+
'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
| 1259 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
|
| 1260 |
+
'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
|
| 1261 |
+
'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
|
| 1262 |
+
no_info = pd.DataFrame(
|
| 1263 |
+
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
| 1264 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
| 1265 |
+
'wt_sequence_match', 'whichIsoform', 'datapoint'])
|
| 1266 |
+
rest = pd.DataFrame(
|
| 1267 |
+
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
| 1268 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
| 1269 |
+
'wt_sequence_match', 'whichIsoform', 'datapoint'])
|
| 1270 |
|
| 1271 |
rest = rest[to_swiss_columns]
|
| 1272 |
rest = rest.drop_duplicates()
|
|
|
|
| 1302 |
not_models = None
|
| 1303 |
modbase_not_match = None
|
| 1304 |
|
|
|
|
| 1305 |
# Final corrections
|
| 1306 |
|
| 1307 |
# Now 3D alignment.
|
|
|
|
| 1323 |
|
| 1324 |
# Fix the axes and merge all data.
|
| 1325 |
|
|
|
|
| 1326 |
pdb.drop(['pdbInfo'], axis=1, inplace=True)
|
| 1327 |
pdb.rename(columns={'resolution': 'score'}, inplace=True)
|
| 1328 |
swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
|
|
|
|
| 1335 |
modbase['source'] = 'MODBASE'
|
| 1336 |
data = pd.concat([swiss, modbase, pdb])
|
| 1337 |
|
|
|
|
| 1338 |
data.reset_index(inplace=True)
|
| 1339 |
data.drop(['index'], axis=1, inplace=True)
|
| 1340 |
data = data.astype('str')
|
|
|
|
| 1358 |
for pdbID in pdb_only.pdbID.to_list():
|
| 1359 |
if pdbID not in existing_free_sasa:
|
| 1360 |
(run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
|
| 1361 |
+
Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
|
| 1362 |
+
include_hetatms=True,
|
| 1363 |
outdir=None, force_rerun=False, file_type='pdb'))
|
| 1364 |
|
|
|
|
| 1365 |
print('Calculation RSA for SwissModel Files...\n')
|
| 1366 |
swiss_only = data[data.source == 'SWISSMODEL']
|
| 1367 |
swiss_dp = []
|
|
|
|
| 1379 |
for pdbID in modbase_only.pdbID.to_list():
|
| 1380 |
if pdbID not in existing_free_sasa:
|
| 1381 |
(run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
|
| 1382 |
+
Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
|
| 1383 |
+
include_hetatms=True,
|
| 1384 |
outdir=None, force_rerun=False, file_type='pdb'))
|
| 1385 |
|
| 1386 |
# This annotation list is different than the prev one, keep it.
|
|
|
|
| 1418 |
chain = data.at[i, 'chain']
|
| 1419 |
uniprotID = data.at[i, 'uniprotID']
|
| 1420 |
pdbID = data.at[i, 'pdbID']
|
| 1421 |
+
alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode,
|
| 1422 |
+
Path(path_to_output_files / '3D_alignment'), file_format='gzip')
|
| 1423 |
mutPos = data.at[i, 'mutationPositionOnPDB']
|
| 1424 |
try:
|
| 1425 |
+
coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
|
| 1426 |
except:
|
| 1427 |
ValueError
|
| 1428 |
coordMut = 'nan'
|
| 1429 |
try:
|
| 1430 |
sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
|
| 1431 |
+
data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
|
| 1432 |
+
data.at[i, 'wt'], mode, path_to_output_files, file_type='pdb')
|
| 1433 |
except:
|
| 1434 |
ValueError
|
| 1435 |
data.at[i, 'sasa'] = 'nan' # mutation position is nan
|
|
|
|
| 1477 |
data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
|
| 1478 |
float(data.at[i, 'domainEndonPDB']))
|
| 1479 |
|
|
|
|
| 1480 |
data = data.astype(str)
|
| 1481 |
data.replace({'NaN': 'nan'}, inplace=True)
|
| 1482 |
|
|
|
|
| 1483 |
# Now unify all 3 separate data. We have with_pdb. The ones that have pdb structures, swiss, modbase, the ones that didn't match with any and the ones that didn't have wt seq match.
|
| 1484 |
|
| 1485 |
# Get interface positions from ECLAIR. Download HQ human
|
|
|
|
| 1500 |
interface_dataframe.columns = ['uniprotID', 'positions']
|
| 1501 |
|
| 1502 |
if len(data) == 0:
|
| 1503 |
+
data = pd.DataFrame(
|
| 1504 |
+
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
| 1505 |
+
'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
|
| 1506 |
+
'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
|
| 1507 |
+
'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
|
| 1508 |
+
'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
|
| 1509 |
+
'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
|
| 1510 |
+
'strand', 'helix', 'turn', 'metalBinding', 'repeat',
|
| 1511 |
+
'topologicalDomain', 'caBinding', 'bindingSite', 'region',
|
| 1512 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
|
| 1513 |
+
'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
|
| 1514 |
+
'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
|
| 1515 |
+
'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
|
| 1516 |
+
'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
|
| 1517 |
+
'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
|
| 1518 |
+
'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
| 1519 |
+
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
|
| 1520 |
+
'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
|
| 1521 |
+
'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
|
| 1522 |
+
'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
|
| 1523 |
+
'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
|
| 1524 |
+
'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
|
| 1525 |
+
'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
|
| 1526 |
else:
|
| 1527 |
data.sasa = data.sasa.astype('str')
|
| 1528 |
|
|
|
|
| 1561 |
|
| 1562 |
data.drop(['positions'], axis=1, inplace=True)
|
| 1563 |
|
|
|
|
| 1564 |
# OPTIONAL
|
| 1565 |
# DOMAIN SELECTION
|
| 1566 |
# Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
|
|
|
|
| 1579 |
# nan--> 0, 0 -->1 and 1 -->2
|
| 1580 |
|
| 1581 |
print('Final adjustments are being done...\n')
|
| 1582 |
+
binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary',
|
| 1583 |
+
'dnaBindingBinary',
|
| 1584 |
'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
|
| 1585 |
'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
|
| 1586 |
'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
|
|
|
| 1682 |
ready = data.copy()
|
| 1683 |
# Imputation
|
| 1684 |
if (impute == 'True') or (impute == 'true') or (impute == True):
|
| 1685 |
+
filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99,
|
| 1686 |
+
16.82,
|
| 1687 |
20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
|
| 1688 |
col_index = 0
|
| 1689 |
for col_ in ready.columns[-30:]:
|
|
|
|
| 1698 |
ready = ready.replace({'nan': np.NaN})
|
| 1699 |
ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
|
| 1700 |
if len(ready) == 0:
|
| 1701 |
+
print(
|
| 1702 |
+
'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
|
| 1703 |
print(ready)
|
| 1704 |
print('Feature vector successfully created...')
|
| 1705 |
return ready
|
|
|
|
| 1710 |
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
| 1711 |
sys.stdout.close()
|
| 1712 |
return ready
|
| 1713 |
+
|