Spaces:
Sleeping
Sleeping
Commit
·
f9741db
1
Parent(s):
55f11f5
Update code/pdb_featureVector.py
Browse files
code/pdb_featureVector.py
CHANGED
|
@@ -95,7 +95,7 @@ def pdb(input_set, mode, impute):
|
|
| 95 |
data.domStart = data.domStart.replace({'nan': '-1'})
|
| 96 |
data.domEnd = data.domEnd.replace({'nan': '-1'})
|
| 97 |
data.distance = data.distance.replace({'nan': '-1'})
|
| 98 |
-
|
| 99 |
"""
|
| 100 |
STEP 4
|
| 101 |
Retrieve canonical and isoform UniProt sequences.
|
|
@@ -197,6 +197,7 @@ def pdb(input_set, mode, impute):
|
|
| 197 |
else:
|
| 198 |
pdbs = []
|
| 199 |
print('Processing PDB structures...\n')
|
|
|
|
| 200 |
if pdbs == []:
|
| 201 |
print('No PDB structure found for the query. ')
|
| 202 |
print('Starting PDB structures download...\n')
|
|
@@ -297,7 +298,7 @@ def pdb(input_set, mode, impute):
|
|
| 297 |
filename.rename(filename_replace_ext.with_suffix('.pdb'))
|
| 298 |
except:
|
| 299 |
FileNotFoundError
|
| 300 |
-
|
| 301 |
uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
|
| 302 |
uniprot_matched = uniprot_matched.astype(str)
|
| 303 |
uniprot_matched = uniprot_matched.drop_duplicates()
|
|
@@ -402,7 +403,7 @@ def pdb(input_set, mode, impute):
|
|
| 402 |
dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
|
| 403 |
dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
|
| 404 |
dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True)
|
| 405 |
-
|
| 406 |
dfM = dfM.astype(str)
|
| 407 |
dfNM = dfNM.astype(str)
|
| 408 |
|
|
@@ -493,7 +494,7 @@ def pdb(input_set, mode, impute):
|
|
| 493 |
|
| 494 |
print('Proceeding to SwissModel search...')
|
| 495 |
print('------------------------------------\n')
|
| 496 |
-
|
| 497 |
# At this point we have 4 dataframes
|
| 498 |
# 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that weren't matched after the alignment; these will be searched in other databases as well.
|
| 499 |
# 1a. aligned --- we are done with this.
|
|
@@ -592,7 +593,7 @@ def pdb(input_set, mode, impute):
|
|
| 592 |
|
| 593 |
with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False)
|
| 594 |
with_swiss_models = with_swiss_models[to_swiss.columns]
|
| 595 |
-
|
| 596 |
# Add model info.
|
| 597 |
|
| 598 |
with_swiss_models = with_swiss_models.astype(str)
|
|
@@ -698,7 +699,7 @@ def pdb(input_set, mode, impute):
|
|
| 698 |
swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'])
|
| 699 |
else:
|
| 700 |
swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']
|
| 701 |
-
|
| 702 |
swissmodels_fasta = swissmodels_fasta.astype(str)
|
| 703 |
|
| 704 |
swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float)
|
|
@@ -813,7 +814,7 @@ def pdb(input_set, mode, impute):
|
|
| 813 |
to_swiss_columns = to_swiss.columns
|
| 814 |
to_swiss_size = len(to_swiss.drop_duplicates(['datapoint']))
|
| 815 |
to_swiss = None
|
| 816 |
-
|
| 817 |
# CONTROL
|
| 818 |
|
| 819 |
"""
|
|
|
|
| 95 |
data.domStart = data.domStart.replace({'nan': '-1'})
|
| 96 |
data.domEnd = data.domEnd.replace({'nan': '-1'})
|
| 97 |
data.distance = data.distance.replace({'nan': '-1'})
|
| 98 |
+
st.write('1')
|
| 99 |
"""
|
| 100 |
STEP 4
|
| 101 |
Retrieve canonical and isoform UniProt sequences.
|
|
|
|
| 197 |
else:
|
| 198 |
pdbs = []
|
| 199 |
print('Processing PDB structures...\n')
|
| 200 |
+
st.write('2')
|
| 201 |
if pdbs == []:
|
| 202 |
print('No PDB structure found for the query. ')
|
| 203 |
print('Starting PDB structures download...\n')
|
|
|
|
| 298 |
filename.rename(filename_replace_ext.with_suffix('.pdb'))
|
| 299 |
except:
|
| 300 |
FileNotFoundError
|
| 301 |
+
st.write('3')
|
| 302 |
uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
|
| 303 |
uniprot_matched = uniprot_matched.astype(str)
|
| 304 |
uniprot_matched = uniprot_matched.drop_duplicates()
|
|
|
|
| 403 |
dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
|
| 404 |
dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
|
| 405 |
dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True)
|
| 406 |
+
st.write('4')
|
| 407 |
dfM = dfM.astype(str)
|
| 408 |
dfNM = dfNM.astype(str)
|
| 409 |
|
|
|
|
| 494 |
|
| 495 |
print('Proceeding to SwissModel search...')
|
| 496 |
print('------------------------------------\n')
|
| 497 |
+
st.write('5')
|
| 498 |
# At this point we have 4 dataframes
|
| 499 |
# 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that weren't matched after the alignment; these will be searched in other databases as well.
|
| 500 |
# 1a. aligned --- we are done with this.
|
|
|
|
| 593 |
|
| 594 |
with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False)
|
| 595 |
with_swiss_models = with_swiss_models[to_swiss.columns]
|
| 596 |
+
st.write('6')
|
| 597 |
# Add model info.
|
| 598 |
|
| 599 |
with_swiss_models = with_swiss_models.astype(str)
|
|
|
|
| 699 |
swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'])
|
| 700 |
else:
|
| 701 |
swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']
|
| 702 |
+
st.write('7')
|
| 703 |
swissmodels_fasta = swissmodels_fasta.astype(str)
|
| 704 |
|
| 705 |
swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float)
|
|
|
|
| 814 |
to_swiss_columns = to_swiss.columns
|
| 815 |
to_swiss_size = len(to_swiss.drop_duplicates(['datapoint']))
|
| 816 |
to_swiss = None
|
| 817 |
+
st.write('8')
|
| 818 |
# CONTROL
|
| 819 |
|
| 820 |
"""
|