Honzus24 committed on
Commit c3d5dd7 · verified · 1 Parent(s): 18ee2cb

Update app.py

Files changed (1)
  1. app.py +128 -108
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import sys
2
- import os
3
  import gradio as gr
4
  from data.scripts.data_utils import parse_PDB
5
  from utils.utils import ClassConfig, DataCollatorForTokenRegression, process_in_batches_and_combine, get_dot_separated_name
@@ -20,23 +20,24 @@ LOCAL_COMPONENT_PATH = BASE_DIR / "gradio_molecule3d" / "backend"
20
  sys.path.insert(0, str(LOCAL_COMPONENT_PATH))
21
  from gradio_molecule3d.molecule3d import Molecule3D
22
  from Bio.PDB import PDBParser, PDBIO
23
 
24
  from data.scripts.data_utils import modify_bfactor_biotite
25
 
26
  def process_pdb_file(pdb_file, backbones, sequences, names):
27
- parsed_name = os.path.splitext(os.path.basename(pdb_file))[0].split('_')
28
- if len(parsed_name[0]) != 4 or len(parsed_name[1]) != 1 or not parsed_name[1].isalpha():
29
- raise ValueError("PDB file name is expected to be in the format of 'name_chain.pdb', e.g.: 1BUI_C.pdb")
30
- _name = parsed_name[0]
31
- _chain = parsed_name[1]
32
  parsed_pdb = parse_PDB(pdb_file, name=_name, input_chain_list=[_chain])[0]
33
  backbone, sequence = parsed_pdb['coords_chain_{}'.format(_chain)], parsed_pdb['seq_chain_{}'.format(_chain)]
34
  if len(sequence) > 1023:
35
- print("Sequence length is greater than 1023, skipping {}".format(_name + "." + _chain))
36
  else:
37
  backbones.append(backbone)
38
  sequences.append(sequence)
39
- names.append(_name + "." + _chain)
40
  return backbones, sequences, names
41
 
42
  def flex_seq(input_seq, input_file):
@@ -44,7 +45,7 @@ def flex_seq(input_seq, input_file):
44
  input_seq = ""
45
 
46
  if not input_seq.strip() and not input_file:
47
- return None, "Provide a file or a input sequence"
48
 
49
  if input_file:
50
  if len(input_file) == 1:
@@ -67,37 +68,25 @@ def flex_seq(input_seq, input_file):
67
  if input_seq:
68
  suffix = ""
69
  proteins = input_seq.split('\n')
70
- for record in proteins:
71
- if ':' in record:
72
- s = record.split(":")
73
- name = s[0]
74
- sequence = s[1]
75
- else:
76
- raise ValueError("Sequence name must contain either an underscore or a dot to separate the PDB code and the chain code.")
77
-
78
- # Normalize name: convert underscore to dot if present
79
- if '_' in name:
80
- name = '.'.join(name.split('_'))
81
- elif '.' in name:
82
- name = name # keep dot as is
83
  else:
84
- raise ValueError("Sequence name must contain either an underscore or a dot to separate the PDB code and the chain code.")
85
 
86
- if datapoint_for_eval == 'all' or name in datapoint_for_eval:
87
  names.append(name)
88
  sequences.append(sequence)
89
  backbones.append(None)
90
 
91
  elif suffix == ".fasta":
92
  for record in SeqIO.parse(input_file, "fasta"):
93
- if '_' in record.name:
94
- dot_separated_name = '.'.join(record.name.split('_'))
95
- elif '.' in record.name:
96
- dot_separated_name = record.name
97
- else:
98
- raise ValueError("Sequence name must contain either an underscore or a dot to separate the PDB code and the chain code.")
99
- if datapoint_for_eval == 'all' or dot_separated_name in datapoint_for_eval:
100
- names.append(dot_separated_name)
101
  sequences.append(str(record.seq))
102
  backbones.append(None)
103
 
@@ -105,29 +94,6 @@ def flex_seq(input_seq, input_file):
105
  backbones, sequences, names = process_pdb_file(input_file, backbones, sequences, names)
106
  pdb_files.append(input_file)
107
 
108
- elif suffix == ".jsonl":
109
- for line in open(input_file, 'r'):
110
- _dict = json.loads(line)
111
-
112
- if 'fluctuations' in _dict.keys():
113
- print("fluctuations are precomputed, using them")
114
- dot_separated_name = get_dot_separated_name(key='pdb_name', _dict=_dict)
115
- if datapoint_for_eval == 'all' or dot_separated_name in datapoint_for_eval:
116
-
117
- names.append(_dict['pdb_name'])
118
- backbones.append(None)
119
- sequences.append(_dict['sequence'])
120
-
121
- flucts_list.append(_dict['fluctuations']+[0.0]) #padding for end cls token
122
- continue
123
-
124
- dot_separated_name = get_dot_separated_name(key='name', _dict=_dict)
125
-
126
- if datapoint_for_eval == 'all' or dot_separated_name in datapoint_for_eval:
127
- backbones.append(_dict['coords'])
128
- sequences.append(_dict['seq'])
129
- names.append(dot_separated_name)
130
-
131
  elif suffix == ".pdb_list":
132
  for i in input_file:
133
  backbones, sequences, names = process_pdb_file(i, backbones, sequences, names)
@@ -142,6 +108,7 @@ def flex_seq(input_seq, input_file):
142
  config = yaml.load(open('configs/train_config.yaml', 'r'), Loader=yaml.FullLoader)
143
  class_config=ClassConfig(config)
144
  class_config.adaptor_architecture = 'no-adaptor'
 
145
  model, tokenizer = PT5_classification_model(half_precision=config['mixed_precision'], class_config=class_config)
146
  model.to(config['inference_args']['device'])
147
  state_dict = torch.load(config['inference_args']['seq_model_path'], map_location=config['inference_args']['device'])
@@ -161,13 +128,6 @@ def flex_seq(input_seq, input_file):
161
  data_collator = DataCollatorForTokenRegression(tokenizer)
162
  batch = data_collator(data_to_collate) # Wrap in list since collator expects batch
163
  batch.to(model.device)
164
- for key in batch.keys():
165
- print("___________-", key, "-___________")
166
- for b in batch[key]:
167
- if key == 'attention_mask':
168
- print(b.sum())
169
- else:
170
- print(b.shape)
171
 
172
  # Predict
173
  with torch.no_grad():
@@ -175,38 +135,35 @@ def flex_seq(input_seq, input_file):
175
  predictions = output_logits[:,:,0] #includes the prediction for the added token
176
  # subselect the predictions using the attention mask
177
 
178
- output_filename = Path(config['inference_args']['prediction_output_dir'].format(output_name, "seq", 'all'))
179
  output_filename.parent.mkdir(parents=True, exist_ok=True)
180
  output_files = []
181
  output_message = "Success"
182
 
183
  for prediction, mask, name, sequence in zip(predictions, batch['attention_mask'], names, sequences):
184
- output_filename_new = output_filename.with_stem("{}_".format(name.replace('.', '_')) + output_filename.stem)
185
  with open(output_filename_new.with_suffix('.txt'), 'w') as f:
186
- f.write("Residue Number Residue ID Flexibility\n")
187
  prediction = prediction[mask.bool()]
188
  if len(prediction) != len(sequence)+1:
189
  print("Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1))
190
 
191
  assert len(prediction) == len(sequence)+1, "Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1)
192
- if '.' in name:
193
- name = name.replace('.', '_')
194
 
195
  p = prediction.tolist()[:-1]
196
  for i in range(len(p)):
197
- f.write(f"{i:<5}{sequence[i]:<20}{round(p[i], 4):<10}\n")
198
  output_files.append(str(output_filename_new.with_suffix('.txt')))
199
 
200
  if suffix == ".pdb" or suffix == ".pdb_list":
201
  for name, pdb_file, prediction in zip(names, pdb_files, predictions):
202
- chain_id = name.split('.')[1]
203
  _prediction = prediction[:-1].reshape(1,-1)
204
- _outname = output_filename.with_name('{}_'.format(name.replace('.', '_')) + output_filename.stem + '.pdb')
205
  print("Saving prediction to {}.".format(_outname))
206
- modify_bfactor_biotite(pdb_file, chain_id, _outname, _prediction) #writing the prediction without the last token
207
  output_files.append(str(_outname))
208
 
209
- _outname = output_filename.with_name(output_filename.stem + '_fasta.fasta')
210
  with open(_outname, 'w') as f:
211
  print("Saving fasta to {}.".format(_outname))
212
  for name, sequence in zip(names, sequences):
@@ -278,6 +235,7 @@ def flex_3d(input_file):
278
  config = yaml.load(open('configs/train_config.yaml', 'r'), Loader=yaml.FullLoader)
279
  class_config=ClassConfig(config)
280
  class_config.adaptor_architecture = 'conv'
 
281
  model, tokenizer = PT5_classification_model(half_precision=config['mixed_precision'], class_config=class_config)
282
 
283
  model.to(config['inference_args']['device'])
@@ -311,13 +269,6 @@ def flex_3d(input_file):
311
 
312
  batch = data_collator(data_to_collate) # Wrap in list since collator expects batch
313
  batch.to(model.device)
314
- for key in batch.keys():
315
- print("___________-", key, "-___________")
316
- for b in batch[key]:
317
- if key == 'attention_mask':
318
- print(b.sum())
319
- else:
320
- print(b.shape)
321
 
322
  # Predict
323
  with torch.no_grad():
@@ -325,32 +276,30 @@ def flex_3d(input_file):
325
  predictions = output_logits[:,:,0] #includes the prediction for the added token
326
  # subselect the predictions using the attention mask
327
 
328
- output_filename = Path(config['inference_args']['prediction_output_dir'].format(output_name, "3D", 'all'))
329
  output_filename.parent.mkdir(parents=True, exist_ok=True)
330
  output_files = []
331
  output_message = "Success"
332
 
333
  for prediction, mask, name, sequence in zip(predictions, batch['attention_mask'], names, sequences):
334
- output_filename_new = output_filename.with_stem("{}_".format(name.replace('.', '_')) + output_filename.stem)
335
  with open(output_filename_new.with_suffix('.txt'), 'w') as f:
336
- f.write("Residue Number Residue ID Flexibility\n")
337
  prediction = prediction[mask.bool()]
338
  if len(prediction) != len(sequence)+1:
339
  print("Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1))
340
 
341
  assert len(prediction) == len(sequence)+1, "Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1)
342
- if '.' in name:
343
- name = name.replace('.', '_')
344
 
345
  p = prediction.tolist()[:-1]
346
  for i in range(len(p)):
347
- f.write(f"{i:<5}{sequence[i]:<20}{round(p[i], 4):<10}\n")
348
  output_files.append(str(output_filename_new.with_suffix('.txt')))
349
 
350
  output_files_enm = []
351
 
352
  for enm_prediction, name in zip(batch['enm_vals'], names):
353
- _outname_new = output_filename.with_name("{}".format(name.replace('.', '_')) + '_enm_' + output_filename.stem + '.txt')
354
  with open(_outname_new, 'w') as f:
355
  print("Saving ENM predictions to {}.".format(_outname_new))
356
  for enm_prediction, name in zip(batch['enm_vals'], names):
@@ -360,14 +309,13 @@ def flex_3d(input_file):
360
 
361
  if suffix == ".pdb" or suffix == ".pdb_list":
362
  for name, pdb_file, prediction in zip(names, pdb_files, predictions):
363
- chain_id = name.split('.')[1]
364
  _prediction = prediction[:-1].reshape(1,-1)
365
- _outname = output_filename.with_name('{}_'.format(name.replace('.', '_')) + output_filename.stem + '.pdb')
366
  print("Saving prediction to {}.".format(_outname))
367
- modify_bfactor_biotite(pdb_file, chain_id, _outname, _prediction) #writing the prediction without the last token
368
  output_files.append(str(_outname))
369
 
370
- _outname = output_filename.with_name(output_filename.stem + '_fasta.fasta')
371
  with open(_outname, 'w') as f:
372
  print("Saving fasta to {}.".format(_outname))
373
  for name, sequence in zip(names, sequences):
@@ -377,39 +325,70 @@ def flex_3d(input_file):
377
 
378
  if suffix == ".pdb" or suffix == ".pdb_list":
379
  for name, pdb_file, enm_vals_single in zip(names, pdb_files, batch['enm_vals']):
380
- _outname = output_filename.with_name('{}_enm_'.format(name.replace('.', '_')) + output_filename.stem + '.pdb')
381
  print("Saving ENM prediction to {}.".format(_outname))
382
- chain_id = name.split('.')[1]
383
  _enm_vals = enm_vals_single[:-1].reshape(1,-1)
384
- modify_bfactor_biotite(pdb_file, chain_id, _outname, _enm_vals) #writing the prediction without the last token
385
  output_files_enm.append(str(_outname))
386
 
387
- print(output_files_enm)
388
  return output_files, output_message, output_files_enm
389
 
390
  def rescale_bfactors(pdb_file):
391
-
392
  base, ext = os.path.splitext(pdb_file)
393
  # Create the new filename
394
- out_file = base + "_scaled" + ext
395
 
396
  parser = PDBParser(QUIET=True)
397
  structure = parser.get_structure("prot", pdb_file)
398
 
399
  # Collect all bfactors
400
  bfactors = [atom.bfactor for atom in structure.get_atoms()]
401
 
402
- min_b = min(bfactors)
403
- max_b = max(bfactors)
404
 
405
  def scale(b):
406
  if max_b == min_b:
407
- return 50.0 # arbitrary mid value
408
  return ((b - min_b) / (max_b - min_b))
409
 
410
  # Rescale all atoms
411
- for atom in structure.get_atoms():
412
- atom.set_bfactor(scale(atom.bfactor))
413
 
414
  # Save to the *new* file path
415
  io = PDBIO()
@@ -418,7 +397,15 @@ def rescale_bfactors(pdb_file):
418
 
419
  return out_file
420
 
421
  def handle_seq_prediction(input_seq, input_file):
422
  main_files, message = flex_seq(input_seq, input_file)
423
 
424
  fasta_index = next(
@@ -437,6 +424,8 @@ def handle_seq_prediction(input_seq, input_file):
437
 
438
 
439
  def handle_3d_prediction(input_file):
440
  main_files, message, enm_files = flex_3d(input_file)
441
 
442
  fasta_index = next(
@@ -454,9 +443,6 @@ def handle_3d_prediction(input_file):
454
 
455
  return main_files, message, pdb_files_for_viz
456
 
457
- def clear_inputs():
458
- return "", []
459
-
460
  PRIMARY = "primary"
461
  SECONDARY = "secondary"
462
 
@@ -500,7 +486,38 @@ gr.set_static_paths(["prediction_results"])
500
  with gr.Blocks(theme=theme) as demo:
501
  gr.Image("Flexpert_logo.png", show_label=False, interactive=False)
502
  gr.Markdown(value="""
503
- About Flexpert.
504
  """)
505
 
506
  with gr.Tab("Flexpert-Seq"):
@@ -511,14 +528,14 @@ with gr.Blocks(theme=theme) as demo:
511
  with gr.Column(visible=True) as col_text_input:
512
  input_seq = gr.Textbox(
513
  label="Paste Protein Sequences (FASTA format)",
514
- placeholder="ProteinName1:AGFASRGT...\nProteinName2:QWERTY...",
515
  lines=10,
516
  scale=2
517
  )
518
 
519
  # Column for File Input (Default: Hidden)
520
  with gr.Column(visible=False) as col_file_input:
521
- input_file = gr.File(label="Select (a) file/s containing one or more protein sequences", file_count="multiple")
522
 
523
  predict_seq = gr.Button("Predict")
524
 
@@ -545,7 +562,7 @@ with gr.Blocks(theme=theme) as demo:
545
 
546
 
547
  with gr.Tab("Flexpert-3D"):
548
- input_file_3d = gr.File(label="Select (a) 3D structure file(s) (.pdb (one or multiple), jsonl)", file_count = "multiple")
549
 
550
  predict_3d = gr.Button("Predict")
551
 
@@ -570,6 +587,9 @@ with gr.Blocks(theme=theme) as demo:
570
 
571
  clear_button = gr.ClearButton([input_seq, input_file, input_file_3d, output_text, molecule_output, output_files])
572
 
 
 
 
573
  # Connect the buttons to their respective functions.
574
  predict_seq.click(handle_seq_prediction, inputs=[input_seq, input_file], outputs=[output_files, output_text, molecule_output])
575
  predict_3d.click(handle_3d_prediction, inputs=[input_file_3d], outputs=[output_files, output_text, molecule_output])
 
1
  import sys
2
+ import os, shutil
3
  import gradio as gr
4
  from data.scripts.data_utils import parse_PDB
5
  from utils.utils import ClassConfig, DataCollatorForTokenRegression, process_in_batches_and_combine, get_dot_separated_name
 
20
  sys.path.insert(0, str(LOCAL_COMPONENT_PATH))
21
  from gradio_molecule3d.molecule3d import Molecule3D
22
  from Bio.PDB import PDBParser, PDBIO
23
+ from biotite.structure import annotate_sse
24
+ import biotite.structure.io as strucio
25
+ import biotite.structure.residues as residues
26
+ import numpy as np
27
 
28
  from data.scripts.data_utils import modify_bfactor_biotite
29
 
30
  def process_pdb_file(pdb_file, backbones, sequences, names):
31
+ _name = pdb_file[:-4]
32
+ _chain = ""
33
  parsed_pdb = parse_PDB(pdb_file, name=_name, input_chain_list=[_chain])[0]
34
  backbone, sequence = parsed_pdb['coords_chain_{}'.format(_chain)], parsed_pdb['seq_chain_{}'.format(_chain)]
35
  if len(sequence) > 1023:
36
+ print("Sequence length is greater than 1023, skipping {}".format(_name))
37
  else:
38
  backbones.append(backbone)
39
  sequences.append(sequence)
40
+ names.append(_name)
41
  return backbones, sequences, names
42
 
43
  def flex_seq(input_seq, input_file):
 
45
  input_seq = ""
46
 
47
  if not input_seq.strip() and not input_file:
48
+ return None, "Provide a file/s or a input sequence/s"
49
 
50
  if input_file:
51
  if len(input_file) == 1:
 
68
  if input_seq:
69
  suffix = ""
70
  proteins = input_seq.split('\n')
71
+ if len(proteins) % 2 != 0:
72
+ raise ValueError("You must adhere to the .fasta format")
73
+ for record in range(0, len(proteins), 2):
74
+ if ">" in proteins[record]:
75
+ name = proteins[record][1:]
76
+ sequence = proteins[record+1]
77
  else:
78
+ raise ValueError("You must adhere to the .fasta format")
79
 
80
+ if datapoint_for_eval == 'all':
81
  names.append(name)
82
  sequences.append(sequence)
83
  backbones.append(None)
84
 
85
  elif suffix == ".fasta":
86
  for record in SeqIO.parse(input_file, "fasta"):
87
+ name = record.name
88
+ if datapoint_for_eval == 'all':
89
+ names.append(name)
90
  sequences.append(str(record.seq))
91
  backbones.append(None)
92
 
 
94
  backbones, sequences, names = process_pdb_file(input_file, backbones, sequences, names)
95
  pdb_files.append(input_file)
96
 
 
97
  elif suffix == ".pdb_list":
98
  for i in input_file:
99
  backbones, sequences, names = process_pdb_file(i, backbones, sequences, names)
 
108
  config = yaml.load(open('configs/train_config.yaml', 'r'), Loader=yaml.FullLoader)
109
  class_config=ClassConfig(config)
110
  class_config.adaptor_architecture = 'no-adaptor'
111
+ config['inference_args']['device'] = config['inference_args']['device'] if torch.cuda.is_available() else 'cpu'
112
  model, tokenizer = PT5_classification_model(half_precision=config['mixed_precision'], class_config=class_config)
113
  model.to(config['inference_args']['device'])
114
  state_dict = torch.load(config['inference_args']['seq_model_path'], map_location=config['inference_args']['device'])
 
128
  data_collator = DataCollatorForTokenRegression(tokenizer)
129
  batch = data_collator(data_to_collate) # Wrap in list since collator expects batch
130
  batch.to(model.device)
131
 
132
  # Predict
133
  with torch.no_grad():
 
135
  predictions = output_logits[:,:,0] #includes the prediction for the added token
136
  # subselect the predictions using the attention mask
137
 
138
+ output_filename = Path(config['inference_args']['prediction_output_dir'].format(output_name, "seq"))
139
  output_filename.parent.mkdir(parents=True, exist_ok=True)
140
  output_files = []
141
  output_message = "Success"
142
 
143
  for prediction, mask, name, sequence in zip(predictions, batch['attention_mask'], names, sequences):
144
+ output_filename_new = output_filename.with_stem("{}_".format(name.split("/")[-1]) + output_filename.stem)
145
  with open(output_filename_new.with_suffix('.txt'), 'w') as f:
146
+ f.write("Residue Number\tResidue ID\tFlexibility\n")
147
  prediction = prediction[mask.bool()]
148
  if len(prediction) != len(sequence)+1:
149
  print("Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1))
150
 
151
  assert len(prediction) == len(sequence)+1, "Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1)
152
 
153
  p = prediction.tolist()[:-1]
154
  for i in range(len(p)):
155
+ f.write(f"{i:<10}\t{sequence[i]:<20}\t{round(p[i], 4):<10}\n")
156
  output_files.append(str(output_filename_new.with_suffix('.txt')))
157
 
158
  if suffix == ".pdb" or suffix == ".pdb_list":
159
  for name, pdb_file, prediction in zip(names, pdb_files, predictions):
 
160
  _prediction = prediction[:-1].reshape(1,-1)
161
+ _outname = output_filename.with_name('{}_'.format(name.split("/")[-1]) + output_filename.stem + '.pdb')
162
  print("Saving prediction to {}.".format(_outname))
163
+ modify_bfactor_biotite(pdb_file, None, _outname, _prediction) #writing the prediction without the last token
164
  output_files.append(str(_outname))
165
 
166
+ _outname = output_filename.with_name(name.split("/")[-1] + output_filename.stem + '.fasta')
167
  with open(_outname, 'w') as f:
168
  print("Saving fasta to {}.".format(_outname))
169
  for name, sequence in zip(names, sequences):
 
235
  config = yaml.load(open('configs/train_config.yaml', 'r'), Loader=yaml.FullLoader)
236
  class_config=ClassConfig(config)
237
  class_config.adaptor_architecture = 'conv'
238
+ config['inference_args']['device'] = config['inference_args']['device'] if torch.cuda.is_available() else 'cpu'
239
  model, tokenizer = PT5_classification_model(half_precision=config['mixed_precision'], class_config=class_config)
240
 
241
  model.to(config['inference_args']['device'])
 
269
 
270
  batch = data_collator(data_to_collate) # Wrap in list since collator expects batch
271
  batch.to(model.device)
272
 
273
  # Predict
274
  with torch.no_grad():
 
276
  predictions = output_logits[:,:,0] #includes the prediction for the added token
277
  # subselect the predictions using the attention mask
278
 
279
+ output_filename = Path(config['inference_args']['prediction_output_dir'].format(output_name, "3D"))
280
  output_filename.parent.mkdir(parents=True, exist_ok=True)
281
  output_files = []
282
  output_message = "Success"
283
 
284
  for prediction, mask, name, sequence in zip(predictions, batch['attention_mask'], names, sequences):
285
+ output_filename_new = output_filename.with_stem("{}_".format(name.split("/")[-1]) + output_filename.stem)
286
  with open(output_filename_new.with_suffix('.txt'), 'w') as f:
287
+ f.write("Residue Number\tResidue ID\tFlexibility\n")
288
  prediction = prediction[mask.bool()]
289
  if len(prediction) != len(sequence)+1:
290
  print("Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1))
291
 
292
  assert len(prediction) == len(sequence)+1, "Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1)
 
 
294
  p = prediction.tolist()[:-1]
295
  for i in range(len(p)):
296
+ f.write(f"{i:<10}\t{sequence[i]:<20}\t{round(p[i], 4):<10}\n")
297
  output_files.append(str(output_filename_new.with_suffix('.txt')))
298
 
299
  output_files_enm = []
300
 
301
  for enm_prediction, name in zip(batch['enm_vals'], names):
302
+ _outname_new = output_filename.with_name("{}".format(name.split("/")[-1]) + '_enm_' + output_filename.stem + '.txt')
303
  with open(_outname_new, 'w') as f:
304
  print("Saving ENM predictions to {}.".format(_outname_new))
305
  for enm_prediction, name in zip(batch['enm_vals'], names):
 
309
 
310
  if suffix == ".pdb" or suffix == ".pdb_list":
311
  for name, pdb_file, prediction in zip(names, pdb_files, predictions):
 
312
  _prediction = prediction[:-1].reshape(1,-1)
313
+ _outname = output_filename.with_name('{}_'.format(name.split("/")[-1]) + output_filename.stem + '.pdb')
314
  print("Saving prediction to {}.".format(_outname))
315
+ modify_bfactor_biotite(pdb_file, None, _outname, _prediction) #writing the prediction without the last token
316
  output_files.append(str(_outname))
317
 
318
+ _outname = output_filename.with_name(name.split("/")[-1] + output_filename.stem + '.fasta')
319
  with open(_outname, 'w') as f:
320
  print("Saving fasta to {}.".format(_outname))
321
  for name, sequence in zip(names, sequences):
 
325
 
326
  if suffix == ".pdb" or suffix == ".pdb_list":
327
  for name, pdb_file, enm_vals_single in zip(names, pdb_files, batch['enm_vals']):
328
+ _outname = output_filename.with_name('{}_enm_'.format(name.split("/")[-1]) + output_filename.stem + '.pdb')
329
  print("Saving ENM prediction to {}.".format(_outname))
 
330
  _enm_vals = enm_vals_single[:-1].reshape(1,-1)
331
+ modify_bfactor_biotite(pdb_file, None, _outname, _enm_vals) #writing the prediction without the last token
332
  output_files_enm.append(str(_outname))
333
 
 
334
  return output_files, output_message, output_files_enm
335
 
336
  def rescale_bfactors(pdb_file):
 
337
  base, ext = os.path.splitext(pdb_file)
338
  # Create the new filename
339
+ out_file = base + "-scaled" + ext
340
+
341
+ atom_array = strucio.load_structure(pdb_file)
342
+ sse = annotate_sse(atom_array)
343
+
344
+ start = 0
345
+
346
+ for i, item in enumerate(sse):
347
+ if item == "a" or item == "b":
348
+ start = i
349
+ break
350
+
351
+ sse = sse[::-1]
352
+ end = 0
353
+
354
+ for i, item in enumerate(sse):
355
+ if item == "a" or item == "b":
356
+ end = i
357
+ break
358
+
359
+ end = len(sse) - end - 1
360
 
361
  parser = PDBParser(QUIET=True)
362
  structure = parser.get_structure("prot", pdb_file)
363
 
364
  # Collect all bfactors
365
  bfactors = [atom.bfactor for atom in structure.get_atoms()]
366
+
367
+ res_starts = residues.get_residue_starts(atom_array)
368
+
369
+ start = res_starts[start]
370
+ end = res_starts[end]
371
+
372
+ bfactors_start = bfactors[:start]
373
+ bfactors_end = bfactors[end:]
374
+ bfactors_struct = bfactors[start:end]
375
 
376
+ min_b = min(bfactors_struct)
377
+ max_b = max(bfactors_struct)
378
+
379
+ bfactors_start = np.clip(a = bfactors_start, min = min_b, max = max_b)
380
+ bfactors_end = np.clip(a = bfactors_end, min = min_b, max = max_b)
381
+
382
+ bfactors = np.concatenate((bfactors_start, bfactors_struct, bfactors_end))
383
 
384
  def scale(b):
385
  if max_b == min_b:
386
+ return 0.5 # arbitrary mid value
387
  return ((b - min_b) / (max_b - min_b))
388
 
389
  # Rescale all atoms
390
+ for i, atom in enumerate(structure.get_atoms()):
391
+ atom.set_bfactor(scale(bfactors[i]))
392
 
393
  # Save to the *new* file path
394
  io = PDBIO()
 
397
 
398
  return out_file
399
 
400
+ def clear_files():
401
+ folder = 'prediction_results/'
402
+ for filename in os.listdir(folder):
403
+ file_path = os.path.join(folder, filename)
404
+ os.remove(file_path)
405
+
406
  def handle_seq_prediction(input_seq, input_file):
407
+ clear_files()
408
+
409
  main_files, message = flex_seq(input_seq, input_file)
410
 
411
  fasta_index = next(
 
424
 
425
 
426
  def handle_3d_prediction(input_file):
427
+ clear_files()
428
+
429
  main_files, message, enm_files = flex_3d(input_file)
430
 
431
  fasta_index = next(
 
443
 
444
  return main_files, message, pdb_files_for_viz
445
 
446
  PRIMARY = "primary"
447
  SECONDARY = "secondary"
448
 
 
486
  with gr.Blocks(theme=theme) as demo:
487
  gr.Image("Flexpert_logo.png", show_label=False, interactive=False)
488
  gr.Markdown(value="""
489
+ ## About Flexpert
490
+
491
+ The web version of Flexpert predicts the per-residue flexibility of a protein, which you can provide either as pasted text or as .pdb/.fasta files.
492
+
493
+ ### Inputs:
494
+
495
+ #### Flexpert-Seq:
496
+
497
+ * **Text** - Enter one or more proteins in FASTA format: a '>Name' line followed by the sequence on the next line.
498
+ * **File** - Select either a .fasta file containing one or more proteins, or one or more .pdb files, each containing a single-chain protein.
499
+ * **Note:** You can use either the **Text** or the **File** input for a single prediction, not both.
500
+
501
+ #### Flexpert-3D:
502
+
503
+ * **File** - Select one or more .pdb files, each containing a single-chain protein.
504
+
505
+ ### Outputs:
506
+
507
+ #### Files:
508
+
509
+ * Depending on your input, different output files appear:
510
+ * A **.txt file** with the per-residue flexibility for all proteins **always appears**.
511
+ * A **.fasta file** containing all the input proteins also appears.
512
+ * If you input a **.pdb file**, two .pdb files per protein appear: one with the **'true'** per-residue flexibilities and one with the **'scaled'** values.
513
+ * For Flexpert-3D, another **.pdb file** per protein also appears containing per-residue ENM values.
514
+
515
+ #### Visualisations:
516
+
517
+ * A visualisation of the per-residue flexibility of the provided proteins is available, but only when the prediction is made from a **.pdb file**.
518
+ * We provide both the **'true'** visualisation (flexibilities as predicted by Flexpert) and the **'scaled'** one (flexibilities normalised to a 0-1 range).
519
+ * To toggle between visualisations, click the lowest button in the side panel (the brush icon) and then choose between the files.
520
+
521
  """)
522
 
523
  with gr.Tab("Flexpert-Seq"):
 
528
  with gr.Column(visible=True) as col_text_input:
529
  input_seq = gr.Textbox(
530
  label="Paste Protein Sequences (FASTA format)",
531
+ placeholder=">ProteinName1\nAGFASRGT...\n>ProteinName2\nQWERTY...",
532
  lines=10,
533
  scale=2
534
  )
535
 
536
  # Column for File Input (Default: Hidden)
537
  with gr.Column(visible=False) as col_file_input:
538
+ input_file = gr.File(label="Select one or more .pdb files OR a .fasta file containing one or more proteins", file_count="multiple", file_types = ['.fasta', '.pdb'])
539
 
540
  predict_seq = gr.Button("Predict")
541
 
 
562
 
563
 
564
  with gr.Tab("Flexpert-3D"):
565
+ input_file_3d = gr.File(label="Select one or more .pdb files", file_count = "multiple", file_types = ['.pdb'])
566
 
567
  predict_3d = gr.Button("Predict")
568
 
 
587
 
588
  clear_button = gr.ClearButton([input_seq, input_file, input_file_3d, output_text, molecule_output, output_files])
589
 
590
+ with gr.Row():
591
+ logos = gr.Image("logos.png", show_label=False, interactive=False)
592
+
593
  # Connect the buttons to their respective functions.
594
  predict_seq.click(handle_seq_prediction, inputs=[input_seq, input_file], outputs=[output_files, output_text, molecule_output])
595
  predict_3d.click(handle_3d_prediction, inputs=[input_file_3d], outputs=[output_files, output_text, molecule_output])
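A minimal sketch (not part of this commit) of the rescaling idea the new rescale_bfactors() implements: B-factors outside the first and last secondary-structure residues are clipped to the range observed inside the structured region, and everything is then min-max scaled to [0, 1]. The start/end indices and the values below are hypothetical; the commit locates them with biotite's annotate_sse.

import numpy as np

def rescale(bfactors, start, end):
    # "core" = the structured (helix/sheet) span, located via annotate_sse in the commit
    core = np.asarray(bfactors, dtype=float)[start:end]
    min_b, max_b = core.min(), core.max()
    if max_b == min_b:
        return np.full(len(bfactors), 0.5)   # arbitrary mid value, as in the commit
    b = np.clip(bfactors, min_b, max_b)      # clamp the loose termini to the core range
    return (b - min_b) / (max_b - min_b)     # min-max scale to [0, 1]

print(rescale([5.0, 1.0, 2.0, 3.0, 9.0], start=1, end=4))
# [1.  0.  0.5 1.  1. ]  -- the termini are clipped to the core range before scaling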