Spaces:
Sleeping
Sleeping
Commit ·
d955409
1
Parent(s): 26f6c21
Update code/pdb_featureVector.py
Browse files
- code/pdb_featureVector.py +0 -18
code/pdb_featureVector.py
CHANGED
|
@@ -60,7 +60,6 @@ def pdb(input_set, mode, impute):
|
|
| 60 |
data = clean_data(input_set)
|
| 61 |
path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(mode)
|
| 62 |
out_path = path_to_output_files / 'log.txt'
|
| 63 |
-
st.write(out_path)
|
| 64 |
sys.stdout = open(out_path, 'w')
|
| 65 |
print('Creating directories...')
|
| 66 |
|
|
@@ -226,24 +225,18 @@ def pdb(input_set, mode, impute):
|
|
| 226 |
existing_pdb = [str(i) for i in existing_pdb]
|
| 227 |
existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
|
| 228 |
cnt = 0
|
| 229 |
-
st.write('existing_pdb', existing_pdb)
|
| 230 |
for search in pdbs:
|
| 231 |
-
st.write('PDBS', search)
|
| 232 |
|
| 233 |
try:
|
| 234 |
if search.lower() not in existing_pdb:
|
| 235 |
-
st.write(Path(path_to_output_files / 'pdb_structures'))
|
| 236 |
file = pdbl.retrieve_pdb_file(search, pdir=Path(path_to_output_files / 'pdb_structures'), file_format="pdb")
|
| 237 |
-
st.write(file)
|
| 238 |
else:
|
| 239 |
print('PDB structure file exists..')
|
| 240 |
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
| 241 |
-
st.write('filename', filename)
|
| 242 |
filename_replace_ext = filename.with_suffix(".pdb")
|
| 243 |
filename.rename(filename_replace_ext)
|
| 244 |
|
| 245 |
file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
|
| 246 |
-
st.write('file', file)
|
| 247 |
|
| 248 |
base = os.path.splitext(str(file))[0]
|
| 249 |
base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
|
|
@@ -253,7 +246,6 @@ def pdb(input_set, mode, impute):
|
|
| 253 |
resolution_method = parser.get_structure(search, file)
|
| 254 |
for record in SeqIO.parse(file, "pdb-seqres"):
|
| 255 |
if record.dbxrefs[0].split(':')[0] == 'UNP':
|
| 256 |
-
st.write('RECORD', record)
|
| 257 |
pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
|
| 258 |
pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
|
| 259 |
pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
|
|
@@ -263,7 +255,6 @@ def pdb(input_set, mode, impute):
|
|
| 263 |
pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
|
| 264 |
index += 1
|
| 265 |
except:
|
| 266 |
-
st.write('ERROR INDEX')
|
| 267 |
IndexError
|
| 268 |
pdb_info.at[index, 'uniprotID'] = 'nan'
|
| 269 |
pdb_info.at[index, 'pdbID'] = 'nan'
|
|
@@ -288,13 +279,10 @@ def pdb(input_set, mode, impute):
|
|
| 288 |
FileNotFoundError
|
| 289 |
|
| 290 |
uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
|
| 291 |
-
st.write('pdb_info', pdb_info)
|
| 292 |
uniprot_matched = uniprot_matched.astype(str)
|
| 293 |
uniprot_matched = uniprot_matched.drop_duplicates()
|
| 294 |
-
st.write('pdb_fasta', pdb_fasta)
|
| 295 |
uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
|
| 296 |
uniprot_matched = uniprot_matched.astype(str)
|
| 297 |
-
st.write('uniprot_matched', uniprot_matched)
|
| 298 |
|
| 299 |
with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
|
| 300 |
(uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
|
|
@@ -304,12 +292,10 @@ def pdb(input_set, mode, impute):
|
|
| 304 |
uniprot_matched.resolution == 'None'))]
|
| 305 |
no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())]
|
| 306 |
no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True)
|
| 307 |
-
st.write('with_pdb', with_pdb)
|
| 308 |
print(
|
| 309 |
'PDB Information successfully added...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n'
|
| 310 |
% (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])),
|
| 311 |
len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint']))))
|
| 312 |
-
st.write('with_pdb1', with_pdb)
|
| 313 |
|
| 314 |
with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
|
| 315 |
with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
|
|
@@ -388,7 +374,6 @@ def pdb(input_set, mode, impute):
|
|
| 388 |
# Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences.
|
| 389 |
with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C')
|
| 390 |
with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C')
|
| 391 |
-
st.write('with_pdb2', with_pdb)
|
| 392 |
|
| 393 |
dfM = with_pdb[with_pdb.wt_sequence_match == 'm']
|
| 394 |
dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
|
|
@@ -415,13 +400,11 @@ def pdb(input_set, mode, impute):
|
|
| 415 |
existing_pdb = None
|
| 416 |
with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
|
| 417 |
with_pdb = None
|
| 418 |
-
st.write('dfM', dfM)
|
| 419 |
|
| 420 |
|
| 421 |
print('Aligning sequences...\n')
|
| 422 |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
| 423 |
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
| 424 |
-
st.write('aligned_m', aligned_m)
|
| 425 |
# When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
|
| 426 |
for i in aligned_m.index:
|
| 427 |
if aligned_m.at[i, 'pdbSequence'] == 'nan':
|
|
@@ -463,7 +446,6 @@ def pdb(input_set, mode, impute):
|
|
| 463 |
yes_pdb_no_match = after_up_pdb_alignment[
|
| 464 |
(after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
|
| 465 |
no_pdb = no_pdb.copy()
|
| 466 |
-
st.write('pdb_aligned', pdb_aligned)
|
| 467 |
|
| 468 |
|
| 469 |
print('PDB matching is completed...\n')
|
|
|
|
| 60 |
data = clean_data(input_set)
|
| 61 |
path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(mode)
|
| 62 |
out_path = path_to_output_files / 'log.txt'
|
|
|
|
| 63 |
sys.stdout = open(out_path, 'w')
|
| 64 |
print('Creating directories...')
|
| 65 |
|
|
|
|
| 225 |
existing_pdb = [str(i) for i in existing_pdb]
|
| 226 |
existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
|
| 227 |
cnt = 0
|
|
|
|
| 228 |
for search in pdbs:
|
|
|
|
| 229 |
|
| 230 |
try:
|
| 231 |
if search.lower() not in existing_pdb:
|
|
|
|
| 232 |
file = pdbl.retrieve_pdb_file(search, pdir=Path(path_to_output_files / 'pdb_structures'), file_format="pdb")
|
|
|
|
| 233 |
else:
|
| 234 |
print('PDB structure file exists..')
|
| 235 |
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
|
|
|
| 236 |
filename_replace_ext = filename.with_suffix(".pdb")
|
| 237 |
filename.rename(filename_replace_ext)
|
| 238 |
|
| 239 |
file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
|
|
|
|
| 240 |
|
| 241 |
base = os.path.splitext(str(file))[0]
|
| 242 |
base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
|
|
|
|
| 246 |
resolution_method = parser.get_structure(search, file)
|
| 247 |
for record in SeqIO.parse(file, "pdb-seqres"):
|
| 248 |
if record.dbxrefs[0].split(':')[0] == 'UNP':
|
|
|
|
| 249 |
pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
|
| 250 |
pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
|
| 251 |
pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
|
|
|
|
| 255 |
pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
|
| 256 |
index += 1
|
| 257 |
except:
|
|
|
|
| 258 |
IndexError
|
| 259 |
pdb_info.at[index, 'uniprotID'] = 'nan'
|
| 260 |
pdb_info.at[index, 'pdbID'] = 'nan'
|
|
|
|
| 279 |
FileNotFoundError
|
| 280 |
|
| 281 |
uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
|
|
|
|
| 282 |
uniprot_matched = uniprot_matched.astype(str)
|
| 283 |
uniprot_matched = uniprot_matched.drop_duplicates()
|
|
|
|
| 284 |
uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
|
| 285 |
uniprot_matched = uniprot_matched.astype(str)
|
|
|
|
| 286 |
|
| 287 |
with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
|
| 288 |
(uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
|
|
|
|
| 292 |
uniprot_matched.resolution == 'None'))]
|
| 293 |
no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())]
|
| 294 |
no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True)
|
|
|
|
| 295 |
print(
|
| 296 |
'PDB Information successfully added...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n'
|
| 297 |
% (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])),
|
| 298 |
len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint']))))
|
|
|
|
| 299 |
|
| 300 |
with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
|
| 301 |
with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
|
|
|
|
| 374 |
# Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences.
|
| 375 |
with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C')
|
| 376 |
with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C')
|
|
|
|
| 377 |
|
| 378 |
dfM = with_pdb[with_pdb.wt_sequence_match == 'm']
|
| 379 |
dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
|
|
|
|
| 400 |
existing_pdb = None
|
| 401 |
with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
|
| 402 |
with_pdb = None
|
|
|
|
| 403 |
|
| 404 |
|
| 405 |
print('Aligning sequences...\n')
|
| 406 |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
| 407 |
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
|
|
|
| 408 |
# When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
|
| 409 |
for i in aligned_m.index:
|
| 410 |
if aligned_m.at[i, 'pdbSequence'] == 'nan':
|
|
|
|
| 446 |
yes_pdb_no_match = after_up_pdb_alignment[
|
| 447 |
(after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
|
| 448 |
no_pdb = no_pdb.copy()
|
|
|
|
| 449 |
|
| 450 |
|
| 451 |
print('PDB matching is completed...\n')
|