Spaces:

Honzus24
/

flexpert

Sleeping

App Files Files Community

Honzus24 committed on Dec 12, 2025

Commit

c3d5dd7

verified ·

1 Parent(s): 18ee2cb

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -108

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import sys
-import os
 import gradio as gr
 from data.scripts.data_utils import parse_PDB
 from utils.utils import ClassConfig, DataCollatorForTokenRegression, process_in_batches_and_combine, get_dot_separated_name
@@ -20,23 +20,24 @@ LOCAL_COMPONENT_PATH = BASE_DIR / "gradio_molecule3d" / "backend"
 sys.path.insert(0, str(LOCAL_COMPONENT_PATH))
 from gradio_molecule3d.molecule3d import Molecule3D
 from Bio.PDB import PDBParser, PDBIO
 from data.scripts.data_utils import modify_bfactor_biotite
 def process_pdb_file(pdb_file, backbones, sequences, names):
-        parsed_name = os.path.splitext(os.path.basename(pdb_file))[0].split('_')
-        if len(parsed_name[0]) != 4 or len(parsed_name[1]) != 1 or not parsed_name[1].isalpha():
-            raise ValueError("PDB file name is expected to be in the format of 'name_chain.pdb', e.g.: 1BUI_C.pdb")
-        _name = parsed_name[0]
-        _chain = parsed_name[1]
         parsed_pdb = parse_PDB(pdb_file, name=_name, input_chain_list=[_chain])[0]
         backbone, sequence = parsed_pdb['coords_chain_{}'.format(_chain)], parsed_pdb['seq_chain_{}'.format(_chain)]
         if len(sequence) > 1023:
-            print("Sequence length is greater than 1023, skipping {}".format(_name + "." + _chain))
         else:
             backbones.append(backbone)
             sequences.append(sequence)
-            names.append(_name + "." + _chain)
         return backbones, sequences, names
 def flex_seq(input_seq, input_file):
@@ -44,7 +45,7 @@ def flex_seq(input_seq, input_file):
         input_seq = ""
     if not input_seq.strip() and not input_file:
-        return None, "Provide a file or a input sequence"
     if input_file:
         if len(input_file) == 1:
@@ -67,37 +68,25 @@ def flex_seq(input_seq, input_file):
     if input_seq:
         suffix = ""
         proteins = input_seq.split('\n')
-        for record in proteins:
-            if ':' in record:
-                s = record.split(":")
-                name = s[0]
-                sequence = s[1]
-            else:
-                raise ValueError("Sequence name must contain either an underscore or a dot to separate the PDB code and the chain code.")
-            # Normalize name: convert underscore to dot if present
-            if '_' in name:
-                name = '.'.join(name.split('_'))
-            elif '.' in name:
-                name = name  # keep dot as is
             else:
-                raise ValueError("Sequence name must contain either an underscore or a dot to separate the PDB code and the chain code.")
-            if datapoint_for_eval == 'all' or name in datapoint_for_eval:
                 names.append(name)
                 sequences.append(sequence)
                 backbones.append(None)
     elif suffix == ".fasta":
         for record in SeqIO.parse(input_file, "fasta"):
-            if '_' in record.name:
-                dot_separated_name = '.'.join(record.name.split('_'))
-            elif '.' in record.name:
-                dot_separated_name = record.name
-            else:
-                raise ValueError("Sequence name must contain either an underscore or a dot to separate the PDB code and the chain code.")
-            if datapoint_for_eval == 'all' or dot_separated_name in datapoint_for_eval:
-                names.append(dot_separated_name)
                 sequences.append(str(record.seq))
                 backbones.append(None)
@@ -105,29 +94,6 @@ def flex_seq(input_seq, input_file):
         backbones, sequences, names = process_pdb_file(input_file, backbones, sequences, names)
         pdb_files.append(input_file)
-    elif suffix == ".jsonl":
-        for line in open(input_file, 'r'):
-            _dict = json.loads(line)
-            if 'fluctuations' in _dict.keys():
-                print("fluctuations are precomputed, using them")
-                dot_separated_name = get_dot_separated_name(key='pdb_name', _dict=_dict)
-                if datapoint_for_eval == 'all' or dot_separated_name in datapoint_for_eval:
-                    names.append(_dict['pdb_name'])
-                    backbones.append(None)
-                    sequences.append(_dict['sequence'])
-                    flucts_list.append(_dict['fluctuations']+[0.0]) #padding for end cls token
-                continue
-            dot_separated_name = get_dot_separated_name(key='name', _dict=_dict)
-            if datapoint_for_eval == 'all' or dot_separated_name in datapoint_for_eval:
-                backbones.append(_dict['coords'])
-                sequences.append(_dict['seq'])
-                names.append(dot_separated_name)
     elif suffix == ".pdb_list":
         for i in input_file:
             backbones, sequences, names = process_pdb_file(i, backbones, sequences, names)
@@ -142,6 +108,7 @@ def flex_seq(input_seq, input_file):
     config = yaml.load(open('configs/train_config.yaml', 'r'), Loader=yaml.FullLoader)
     class_config=ClassConfig(config)
     class_config.adaptor_architecture = 'no-adaptor'
     model, tokenizer = PT5_classification_model(half_precision=config['mixed_precision'], class_config=class_config)
     model.to(config['inference_args']['device'])
     state_dict = torch.load(config['inference_args']['seq_model_path'], map_location=config['inference_args']['device'])
@@ -161,13 +128,6 @@ def flex_seq(input_seq, input_file):
     data_collator = DataCollatorForTokenRegression(tokenizer)
     batch = data_collator(data_to_collate)  # Wrap in list since collator expects batch
     batch.to(model.device)
-    for key in batch.keys():
-        print("___________-", key, "-___________")
-        for b in batch[key]:
-            if key == 'attention_mask':
-                print(b.sum())
-            else:
-                print(b.shape)
     # Predict
     with torch.no_grad():
@@ -175,38 +135,35 @@ def flex_seq(input_seq, input_file):
         predictions = output_logits[:,:,0] #includes the prediction for the added token
         # subselect the predictions using the attention mask
-    output_filename = Path(config['inference_args']['prediction_output_dir'].format(output_name, "seq", 'all'))
     output_filename.parent.mkdir(parents=True, exist_ok=True)
     output_files = []
     output_message = "Success"
     for prediction, mask, name, sequence in zip(predictions, batch['attention_mask'], names, sequences):
-        output_filename_new = output_filename.with_stem("{}_".format(name.replace('.', '_')) + output_filename.stem)
         with open(output_filename_new.with_suffix('.txt'), 'w') as f:
-            f.write("Residue Number    Residue ID   Flexibility\n")
             prediction = prediction[mask.bool()]
             if len(prediction) != len(sequence)+1:
                 print("Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1))
             assert len(prediction) == len(sequence)+1, "Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1)
-            if '.' in name:
-                name = name.replace('.', '_')
             p = prediction.tolist()[:-1]
             for i in range(len(p)):
-                f.write(f"{i:<5}{sequence[i]:<20}{round(p[i], 4):<10}\n")
         output_files.append(str(output_filename_new.with_suffix('.txt')))
     if suffix == ".pdb" or suffix == ".pdb_list":
         for name, pdb_file, prediction in zip(names, pdb_files, predictions):
-            chain_id = name.split('.')[1]
             _prediction = prediction[:-1].reshape(1,-1)
-            _outname = output_filename.with_name('{}_'.format(name.replace('.', '_')) + output_filename.stem + '.pdb')
             print("Saving prediction to {}.".format(_outname))
-            modify_bfactor_biotite(pdb_file, chain_id, _outname, _prediction) #writing the prediction without the last token
             output_files.append(str(_outname))
-    _outname = output_filename.with_name(output_filename.stem + '_fasta.fasta')
     with open(_outname, 'w') as f:
         print("Saving fasta to {}.".format(_outname))
         for name, sequence in zip(names, sequences):
@@ -278,6 +235,7 @@ def flex_3d(input_file):
     config = yaml.load(open('configs/train_config.yaml', 'r'), Loader=yaml.FullLoader)
     class_config=ClassConfig(config)
     class_config.adaptor_architecture = 'conv'
     model, tokenizer = PT5_classification_model(half_precision=config['mixed_precision'], class_config=class_config)
     model.to(config['inference_args']['device'])
@@ -311,13 +269,6 @@ def flex_3d(input_file):
     batch = data_collator(data_to_collate)  # Wrap in list since collator expects batch
     batch.to(model.device)
-    for key in batch.keys():
-        print("___________-", key, "-___________")
-        for b in batch[key]:
-            if key == 'attention_mask':
-                print(b.sum())
-            else:
-                print(b.shape)
     # Predict
     with torch.no_grad():
@@ -325,32 +276,30 @@ def flex_3d(input_file):
         predictions = output_logits[:,:,0] #includes the prediction for the added token
         # subselect the predictions using the attention mask
-    output_filename = Path(config['inference_args']['prediction_output_dir'].format(output_name, "3D", 'all'))
     output_filename.parent.mkdir(parents=True, exist_ok=True)
     output_files = []
     output_message = "Success"
     for prediction, mask, name, sequence in zip(predictions, batch['attention_mask'], names, sequences):
-        output_filename_new = output_filename.with_stem("{}_".format(name.replace('.', '_')) + output_filename.stem)
         with open(output_filename_new.with_suffix('.txt'), 'w') as f:
-            f.write("Residue Number    Residue ID   Flexibility\n")
             prediction = prediction[mask.bool()]
             if len(prediction) != len(sequence)+1:
                 print("Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1))
             assert len(prediction) == len(sequence)+1, "Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1)
-            if '.' in name:
-                name = name.replace('.', '_')
             p = prediction.tolist()[:-1]
             for i in range(len(p)):
-                f.write(f"{i:<5}{sequence[i]:<20}{round(p[i], 4):<10}\n")
         output_files.append(str(output_filename_new.with_suffix('.txt')))
     output_files_enm = []
     for enm_prediction, name in zip(batch['enm_vals'], names):
-        _outname_new = output_filename.with_name("{}".format(name.replace('.', '_')) + '_enm_' + output_filename.stem + '.txt')
         with open(_outname_new, 'w') as f:
             print("Saving ENM predictions to {}.".format(_outname_new))
             for enm_prediction, name in zip(batch['enm_vals'], names):
@@ -360,14 +309,13 @@ def flex_3d(input_file):
     if suffix == ".pdb" or suffix == ".pdb_list":
         for name, pdb_file, prediction in zip(names, pdb_files, predictions):
-            chain_id = name.split('.')[1]
             _prediction = prediction[:-1].reshape(1,-1)
-            _outname = output_filename.with_name('{}_'.format(name.replace('.', '_')) + output_filename.stem + '.pdb')
             print("Saving prediction to {}.".format(_outname))
-            modify_bfactor_biotite(pdb_file, chain_id, _outname, _prediction) #writing the prediction without the last token
             output_files.append(str(_outname))
-    _outname = output_filename.with_name(output_filename.stem + '_fasta.fasta')
     with open(_outname, 'w') as f:
         print("Saving fasta to {}.".format(_outname))
         for name, sequence in zip(names, sequences):
@@ -377,39 +325,70 @@ def flex_3d(input_file):
     if suffix == ".pdb" or suffix == ".pdb_list":
         for name, pdb_file, enm_vals_single in zip(names, pdb_files, batch['enm_vals']):
-            _outname = output_filename.with_name('{}_enm_'.format(name.replace('.', '_')) + output_filename.stem + '.pdb')
             print("Saving ENM prediction to {}.".format(_outname))
-            chain_id = name.split('.')[1]
             _enm_vals = enm_vals_single[:-1].reshape(1,-1)
-            modify_bfactor_biotite(pdb_file, chain_id, _outname, _enm_vals) #writing the prediction without the last token
             output_files_enm.append(str(_outname))
-    print(output_files_enm)
     return output_files, output_message, output_files_enm
 def rescale_bfactors(pdb_file):
     base, ext = os.path.splitext(pdb_file)
     # Create the new filename
-    out_file = base + "_scaled" + ext
     parser = PDBParser(QUIET=True)
     structure = parser.get_structure("prot", pdb_file)
     # Collect all bfactors
     bfactors = [atom.bfactor for atom in structure.get_atoms()]
-    min_b = min(bfactors)
-    max_b = max(bfactors)
     def scale(b):
         if max_b == min_b:
-            return 50.0  # arbitrary mid value
         return ((b - min_b) / (max_b - min_b))
     # Rescale all atoms
-    for atom in structure.get_atoms():
-        atom.set_bfactor(scale(atom.bfactor))
     # Save to the *new* file path
     io = PDBIO()
@@ -418,7 +397,15 @@ def rescale_bfactors(pdb_file):
     return out_file
 def handle_seq_prediction(input_seq, input_file):
     main_files, message = flex_seq(input_seq, input_file)
     fasta_index = next(
@@ -437,6 +424,8 @@ def handle_seq_prediction(input_seq, input_file):
 def handle_3d_prediction(input_file):
     main_files, message, enm_files = flex_3d(input_file)
     fasta_index = next(
@@ -454,9 +443,6 @@ def handle_3d_prediction(input_file):
     return main_files, message, pdb_files_for_viz
-def clear_inputs():
-    return "", []
 PRIMARY = "primary"
 SECONDARY = "secondary"
@@ -500,7 +486,38 @@ gr.set_static_paths(["prediction_results"])
 with gr.Blocks(theme=theme) as demo:
     gr.Image("Flexpert_logo.png", show_label=False, interactive=False)
     gr.Markdown(value="""
-        About Flexpert.
         """)
     with gr.Tab("Flexpert-Seq"):
@@ -511,14 +528,14 @@ with gr.Blocks(theme=theme) as demo:
         with gr.Column(visible=True) as col_text_input:
             input_seq = gr.Textbox(
                 label="Paste Protein Sequences (FASTA format)",
-                placeholder="ProteinName1:AGFASRGT...\nProteinName2:QWERTY...",
                 lines=10,
                 scale=2
             )
         # Column for File Input (Default: Hidden)
         with gr.Column(visible=False) as col_file_input:
-            input_file = gr.File(label="Select (a) file/s containing one or more protein sequences", file_count="multiple")
         predict_seq = gr.Button("Predict")
@@ -545,7 +562,7 @@ with gr.Blocks(theme=theme) as demo:
     with gr.Tab("Flexpert-3D"):
-        input_file_3d = gr.File(label="Select (a) 3D structure file(s) (.pdb (one or multiple), jsonl)", file_count = "multiple")
         predict_3d = gr.Button("Predict")
@@ -570,6 +587,9 @@ with gr.Blocks(theme=theme) as demo:
     clear_button = gr.ClearButton([input_seq, input_file, input_file_3d, output_text, molecule_output, output_files])
     # Connect the buttons to their respective functions.
     predict_seq.click(handle_seq_prediction, inputs=[input_seq, input_file], outputs=[output_files, output_text, molecule_output])
     predict_3d.click(handle_3d_prediction, inputs=[input_file_3d], outputs=[output_files, output_text, molecule_output])

 import sys
+import os, shutil
 import gradio as gr
 from data.scripts.data_utils import parse_PDB
 from utils.utils import ClassConfig, DataCollatorForTokenRegression, process_in_batches_and_combine, get_dot_separated_name
 sys.path.insert(0, str(LOCAL_COMPONENT_PATH))
 from gradio_molecule3d.molecule3d import Molecule3D
 from Bio.PDB import PDBParser, PDBIO
+from biotite.structure import annotate_sse
+import biotite.structure.io as strucio
+import biotite.structure.residues as residues
+import numpy as np
 from data.scripts.data_utils import modify_bfactor_biotite
 def process_pdb_file(pdb_file, backbones, sequences, names):
     """Parse one PDB file and append its backbone, sequence and name.

     The entry name is the file path without its extension. The chain id is
     left empty, so the parser is asked for ``input_chain_list=[""]`` and the
     results are read from the ``coords_chain_`` / ``seq_chain_`` keys.
     A structure whose sequence is longer than 1023 residues is reported on
     stdout and skipped.

     Args:
         pdb_file: Path of the PDB file to parse.
         backbones: List that receives the parsed backbone coordinates.
         sequences: List that receives the parsed sequence.
         names: List that receives the entry name.

     Returns:
         The same three lists ``(backbones, sequences, names)``; they are
         extended in place.
     """
     # Strip whatever extension the file carries instead of assuming that it
     # is exactly four characters long (".pdb").
     _name = os.path.splitext(pdb_file)[0]
     _chain = ""
     parsed_pdb = parse_PDB(pdb_file, name=_name, input_chain_list=[_chain])[0]
     backbone = parsed_pdb['coords_chain_{}'.format(_chain)]
     sequence = parsed_pdb['seq_chain_{}'.format(_chain)]
     if len(sequence) > 1023:
         # NOTE(review): 1023 is presumably the model's maximum input length - confirm.
         print("Sequence length is greater than 1023, skipping {}".format(_name))
     else:
         backbones.append(backbone)
         sequences.append(sequence)
         names.append(_name)
     return backbones, sequences, names
 def flex_seq(input_seq, input_file):
         input_seq = ""
     if not input_seq.strip() and not input_file:
+        return None, "Provide a file/s or a input sequence/s"
     if input_file:
         if len(input_file) == 1:
     if input_seq:
         suffix = ""
         proteins = input_seq.split('\n')
+        if len(proteins) % 2 != 0:
+            raise ValueError("You must adhere to the .fasta format")
+        for record in range(0, len(proteins), 2):
+            if ">" in proteins[record]:
+                name = proteins[record][1:]
+                sequence = proteins[record+1]
             else:
+                raise ValueError("You must adhere to the .fasta format")
+            if datapoint_for_eval == 'all':
                 names.append(name)
                 sequences.append(sequence)
                 backbones.append(None)
     elif suffix == ".fasta":
         for record in SeqIO.parse(input_file, "fasta"):
+            name = record.name
+            if datapoint_for_eval == 'all':
+                names.append(name)
                 sequences.append(str(record.seq))
                 backbones.append(None)
         backbones, sequences, names = process_pdb_file(input_file, backbones, sequences, names)
         pdb_files.append(input_file)
     elif suffix == ".pdb_list":
         for i in input_file:
             backbones, sequences, names = process_pdb_file(i, backbones, sequences, names)
     config = yaml.load(open('configs/train_config.yaml', 'r'), Loader=yaml.FullLoader)
     class_config=ClassConfig(config)
     class_config.adaptor_architecture = 'no-adaptor'
+    config['inference_args']['device'] = config['inference_args']['device'] if torch.cuda.is_available() else 'cpu'
     model, tokenizer = PT5_classification_model(half_precision=config['mixed_precision'], class_config=class_config)
     model.to(config['inference_args']['device'])
     state_dict = torch.load(config['inference_args']['seq_model_path'], map_location=config['inference_args']['device'])
     data_collator = DataCollatorForTokenRegression(tokenizer)
     batch = data_collator(data_to_collate)  # Wrap in list since collator expects batch
     batch.to(model.device)
     # Predict
     with torch.no_grad():
         predictions = output_logits[:,:,0] #includes the prediction for the added token
         # subselect the predictions using the attention mask
+    output_filename = Path(config['inference_args']['prediction_output_dir'].format(output_name, "seq"))
     output_filename.parent.mkdir(parents=True, exist_ok=True)
     output_files = []
     output_message = "Success"
     for prediction, mask, name, sequence in zip(predictions, batch['attention_mask'], names, sequences):
+        output_filename_new = output_filename.with_stem("{}_".format(name.split("/")[-1]) + output_filename.stem)
         with open(output_filename_new.with_suffix('.txt'), 'w') as f:
+            f.write("Residue Number\tResidue ID\tFlexibility\n")
             prediction = prediction[mask.bool()]
             if len(prediction) != len(sequence)+1:
                 print("Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1))
             assert len(prediction) == len(sequence)+1, "Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1)
             p = prediction.tolist()[:-1]
             for i in range(len(p)):
+                f.write(f"{i:<10}\t{sequence[i]:<20}\t{round(p[i], 4):<10}\n")
         output_files.append(str(output_filename_new.with_suffix('.txt')))
     if suffix == ".pdb" or suffix == ".pdb_list":
         for name, pdb_file, prediction in zip(names, pdb_files, predictions):
             _prediction = prediction[:-1].reshape(1,-1)
+            _outname = output_filename.with_name('{}_'.format(name.split("/")[-1]) + output_filename.stem + '.pdb')
             print("Saving prediction to {}.".format(_outname))
+            modify_bfactor_biotite(pdb_file, None, _outname, _prediction) #writing the prediction without the last token
             output_files.append(str(_outname))
+    _outname = output_filename.with_name(name.split("/")[-1] + output_filename.stem + '.fasta')
     with open(_outname, 'w') as f:
         print("Saving fasta to {}.".format(_outname))
         for name, sequence in zip(names, sequences):
     config = yaml.load(open('configs/train_config.yaml', 'r'), Loader=yaml.FullLoader)
     class_config=ClassConfig(config)
     class_config.adaptor_architecture = 'conv'
+    config['inference_args']['device'] = config['inference_args']['device'] if torch.cuda.is_available() else 'cpu'
     model, tokenizer = PT5_classification_model(half_precision=config['mixed_precision'], class_config=class_config)
     model.to(config['inference_args']['device'])
     batch = data_collator(data_to_collate)  # Wrap in list since collator expects batch
     batch.to(model.device)
     # Predict
     with torch.no_grad():
         predictions = output_logits[:,:,0] #includes the prediction for the added token
         # subselect the predictions using the attention mask
+    output_filename = Path(config['inference_args']['prediction_output_dir'].format(output_name, "3D"))
     output_filename.parent.mkdir(parents=True, exist_ok=True)
     output_files = []
     output_message = "Success"
     for prediction, mask, name, sequence in zip(predictions, batch['attention_mask'], names, sequences):
+        output_filename_new = output_filename.with_stem("{}_".format(name.split("/")[-1]) + output_filename.stem)
         with open(output_filename_new.with_suffix('.txt'), 'w') as f:
+            f.write("Residue Number\tResidue ID\tFlexibility\n")
             prediction = prediction[mask.bool()]
             if len(prediction) != len(sequence)+1:
                 print("Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1))
             assert len(prediction) == len(sequence)+1, "Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1)
             p = prediction.tolist()[:-1]
             for i in range(len(p)):
+                f.write(f"{i:<10}\t{sequence[i]:<20}\t{round(p[i], 4):<10}\n")
         output_files.append(str(output_filename_new.with_suffix('.txt')))
     output_files_enm = []
     for enm_prediction, name in zip(batch['enm_vals'], names):
+        _outname_new = output_filename.with_name("{}".format(name.split("/")[-1]) + '_enm_' + output_filename.stem + '.txt')
         with open(_outname_new, 'w') as f:
             print("Saving ENM predictions to {}.".format(_outname_new))
             for enm_prediction, name in zip(batch['enm_vals'], names):
     if suffix == ".pdb" or suffix == ".pdb_list":
         for name, pdb_file, prediction in zip(names, pdb_files, predictions):
             _prediction = prediction[:-1].reshape(1,-1)
+            _outname = output_filename.with_name('{}_'.format(name.split("/")[-1]) + output_filename.stem + '.pdb')
             print("Saving prediction to {}.".format(_outname))
+            modify_bfactor_biotite(pdb_file, None, _outname, _prediction) #writing the prediction without the last token
             output_files.append(str(_outname))
+    _outname = output_filename.with_name(name.split("/")[-1] + output_filename.stem + '.fasta')
     with open(_outname, 'w') as f:
         print("Saving fasta to {}.".format(_outname))
         for name, sequence in zip(names, sequences):
     if suffix == ".pdb" or suffix == ".pdb_list":
         for name, pdb_file, enm_vals_single in zip(names, pdb_files, batch['enm_vals']):
+            _outname = output_filename.with_name('{}_enm_'.format(name.split("/")[-1]) + output_filename.stem + '.pdb')
             print("Saving ENM prediction to {}.".format(_outname))
             _enm_vals = enm_vals_single[:-1].reshape(1,-1)
+            modify_bfactor_biotite(pdb_file, None, _outname, _enm_vals) #writing the prediction without the last token
             output_files_enm.append(str(_outname))
     return output_files, output_message, output_files_enm
 def rescale_bfactors(pdb_file):
     base, ext = os.path.splitext(pdb_file)
     # Create the new filename
+    out_file = base + "-scaled" + ext
+    atom_array = strucio.load_structure(pdb_file)
+    sse = annotate_sse(atom_array)
+    start = 0
+    for i, item in enumerate(sse):
+        if item == "a" or item == "b":
+            start = i
+            break
+    sse = sse[::-1]
+    end = 0
+    for i, item in enumerate(sse):
+        if item == "a" or item == "b":
+            end = i
+            break
+    end = len(sse) - end - 1
     parser = PDBParser(QUIET=True)
     structure = parser.get_structure("prot", pdb_file)
     # Collect all bfactors
     bfactors = [atom.bfactor for atom in structure.get_atoms()]
+    res_starts = residues.get_residue_starts(atom_array)
+    start = res_starts[start]
+    end = res_starts[end]
+    bfactors_start = bfactors[:start]
+    bfactors_end = bfactors[end:]
+    bfactors_struct = bfactors[start:end]
+    min_b = min(bfactors_struct)
+    max_b = max(bfactors_struct)
+    bfactors_start = np.clip(a = bfactors_start, min = min_b, max = max_b)
+    bfactors_end = np.clip(a = bfactors_end, min = min_b, max = max_b)
+    bfactors = np.concatenate((bfactors_start, bfactors_struct, bfactors_end))
     def scale(b):
         if max_b == min_b:
+            return 0.5  # arbitrary mid value
         return ((b - min_b) / (max_b - min_b))
     # Rescale all atoms
+    for i, atom in enumerate(structure.get_atoms()):
+        atom.set_bfactor(scale(bfactors[i]))
     # Save to the *new* file path
     io = PDBIO()
     return out_file
def clear_files(folder='prediction_results/'):
    """Remove every entry inside ``folder`` so old outputs are not served again.

    The folder itself is kept. A folder that does not exist yet is treated as
    already empty, so the very first prediction does not fail before any
    output directory has been created.

    Args:
        folder: Directory to empty. Defaults to the prediction output folder.
    """
    if not os.path.isdir(folder):
        return
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        # os.remove() cannot delete directories; remove real sub-directories
        # recursively and unlink everything else (files and symlinks).
        if os.path.isdir(file_path) and not os.path.islink(file_path):
            shutil.rmtree(file_path)
        else:
            os.remove(file_path)
 def handle_seq_prediction(input_seq, input_file):
+    clear_files()
     main_files, message = flex_seq(input_seq, input_file)
     fasta_index = next(
 def handle_3d_prediction(input_file):
+    clear_files()
     main_files, message, enm_files = flex_3d(input_file)
     fasta_index = next(
     return main_files, message, pdb_files_for_viz
 PRIMARY = "primary"
 SECONDARY = "secondary"
 with gr.Blocks(theme=theme) as demo:
     gr.Image("Flexpert_logo.png", show_label=False, interactive=False)
     gr.Markdown(value="""
+        ## About Flexpert
+        On the web-version of Flexpert you can calculate the per-residue flexibility of a protein by either inputting the protein as a string or through .pdb/.fasta files.
+        ### Inputs:
+        #### Flexpert-Seq:
+        * **Text** - Enter one or more proteins according to the specified format.
+        * **File** - Select either .fasta file containing one or more proteins, or one or more .pdb files with a single-chain protein in the file.
+        * **Note:** You can only select either **Text** or **File** input options per a single prediction.
+        #### Flexpert-3D:
+        * **File** - Select one or more .pdb files with a single-chain protein in the file.
+        ### Outputs:
+        #### Files:
+        * Depending on your input, different output files appear:
+            * A **.txt file** with the per-residue flexibility for all proteins **always appears**.
+            * A **.fasta file** appears with all the proteins.
+            * If you input a **.pdb file**, two .pdb files per protein appear, one with **'true'** per-residue flexibilities and **'scaled'** per-residue flexibilities.
+            * For Flexpert-3D, another **.pdb file** per protein also appears containing per-residue ENM values.
+        #### Visualisations:
+        * You will notice that there is a possibility of seeing a visualisation of the per-residue flexibility of the provided proteins. These visualisations can only appear if you predict the flexibility via a **.pdb file**.
+        * We provide both the **'real'** (flexibilities predicted by Flexpert) and the **'scaled'** (flexibilities normalised according to the maximum flexibility) visualisations.
+        * To toggle between visualisations, click the lower-most button on the side-panel (the brush) and then choose between files.
         """)
     with gr.Tab("Flexpert-Seq"):
         with gr.Column(visible=True) as col_text_input:
             input_seq = gr.Textbox(
                 label="Paste Protein Sequences (FASTA format)",
+                placeholder=">ProteinName1\nAGFASRGT...\n>ProteinName2\nQWERTY...",
                 lines=10,
                 scale=2
             )
         # Column for File Input (Default: Hidden)
         with gr.Column(visible=False) as col_file_input:
+            input_file = gr.File(label="Select one or more .pdb files OR a .fasta file containing one or more proteins", file_count="multiple", file_types = ['.fasta', '.pdb'])
         predict_seq = gr.Button("Predict")
     with gr.Tab("Flexpert-3D"):
+        input_file_3d = gr.File(label="Select one or more .pdb files", file_count = "multiple", file_types = ['.pdb'])
         predict_3d = gr.Button("Predict")
     clear_button = gr.ClearButton([input_seq, input_file, input_file_3d, output_text, molecule_output, output_files])
+    with gr.Row():
+        logos = gr.Image("logos.png", show_label=False, interactive=False)
     # Connect the buttons to their respective functions.
     predict_seq.click(handle_seq_prediction, inputs=[input_seq, input_file], outputs=[output_files, output_text, molecule_output])
     predict_3d.click(handle_3d_prediction, inputs=[input_file_3d], outputs=[output_files, output_text, molecule_output])