Spaces:

Honzus24
/

flexpert

Running on Zero

App Files Files Community

Honzus24 committed on Feb 27

Commit

6886821

verified ·

1 Parent(s): c707c1f

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -48

app.py CHANGED Viewed

@@ -92,14 +92,13 @@ def process_pdb_file(pdb_file, backbones, sequences, names):
         names.append(_name)
     return backbones, sequences, names
-@spaces.GPU
-def flex_seq(input_seq, input_file):
     if not input_seq:
         input_seq = ""
     if not input_seq.strip() and not input_file:
-        return None, "Provide a file/s or a input sequence/s"
     if input_file:
         if len(input_file) == 1:
             input_file = input_file[0]
@@ -109,13 +108,7 @@ def flex_seq(input_seq, input_file):
     default_name = '{}'.format(datetime.now().strftime('%Y%m%d_%H%M%S'))
     output_name = default_name
-    sequences = []
-    names = []
-    backbones = []
-    flucts_list = []
-    pdb_files = []
     datapoint_for_eval = 'all'
     if input_seq:
@@ -129,12 +122,10 @@ def flex_seq(input_seq, input_file):
                 sequence = proteins[record+1]
             else:
                 raise ValueError("You must adhere to the .fasta format")
             if datapoint_for_eval == 'all':
                 names.append(name)
                 sequences.append(sequence)
                 backbones.append(None)
     elif suffix == ".fasta":
         for record in SeqIO.parse(input_file, "fasta"):
             name = record.name
@@ -142,59 +133,56 @@ def flex_seq(input_seq, input_file):
                 names.append(name)
                 sequences.append(str(record.seq))
                 backbones.append(None)
     elif suffix == ".pdb":
         backbones, sequences, names = process_pdb_file(input_file, backbones, sequences, names)
         pdb_files.append(input_file)
     elif suffix == ".pdb_list":
         for i in input_file:
             backbones, sequences, names = process_pdb_file(i, backbones, sequences, names)
             pdb_files.append(i)
     env_config = yaml.load(open('configs/env_config.yaml', 'r'), Loader=yaml.FullLoader)
-    # Set folder for huggingface cache
     os.environ['HF_HOME'] = env_config['huggingface']['HF_HOME']
-    # Set gpu device
-    os.environ["CUDA_VISIBLE_DEVICES"]= env_config['gpus']['cuda_visible_device']
     config = yaml.load(open('configs/train_config.yaml', 'r'), Loader=yaml.FullLoader)
-    class_config=ClassConfig(config)
     class_config.adaptor_architecture = 'no-adaptor'
-    config['inference_args']['device'] = config['inference_args']['device'] if torch.cuda.is_available() else 'cpu'
     model, tokenizer = PT5_classification_model(half_precision=config['mixed_precision'], class_config=class_config)
-    model.to(config['inference_args']['device'])
     repo_id = "Honzus24/Flexpert_weights"
     file_weights = config['inference_args']['seq_model_path']
-    # Get path (instant if cached)
     weights_path = get_weights_path(repo_id, file_weights)
-    # Load weights
-    state_dict = torch.load(weights_path, map_location=config['inference_args']['device'])
     model.load_state_dict(state_dict, strict=False)
     model.eval()
     data_to_collate = []
     for idx, (backbone, sequence) in enumerate(zip(backbones, sequences)):
-        #Ensure that the missing residues in the sequence are not represented as '-' but as 'X'
-        sequence = sequence.replace('-', 'X') #due to the tokenizer vocabulary
         tokenizer_out = tokenizer(' '.join(sequence), add_special_tokens=True, return_tensors='pt')
-        tokenized_seq, attention_mask = tokenizer_out['input_ids'].to(config['inference_args']['device']), tokenizer_out['attention_mask'].to(config['inference_args']['device'])
         data_to_collate.append({'input_ids': tokenized_seq[0,:], 'attention_mask': attention_mask[0,:]})
     data_collator = DataCollatorForTokenRegression(tokenizer)
-    batch = data_collator(data_to_collate)  # Wrap in list since collator expects batch
-    batch.to(model.device)
-    # Predict
     with torch.no_grad():
         output_logits = process_in_batches_and_combine(model, batch, config['inference_args']['batch_size'])
-        predictions = output_logits[:,:,0] #includes the prediction for the added token
-        # subselect the predictions using the attention mask
     output_filename = Path(config['inference_args']['prediction_output_dir'].format(output_name, "seq"))
     output_filename.parent.mkdir(parents=True, exist_ok=True)
     output_files = []
@@ -205,11 +193,7 @@ def flex_seq(input_seq, input_file):
         with open(output_filename_new.with_suffix('.txt'), 'w') as f:
             f.write("Residue Number\tResidue ID\tFlexibility\n")
             prediction = prediction[mask.bool()]
-            if len(prediction) != len(sequence)+1:
-                print("Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1))
             assert len(prediction) == len(sequence)+1, "Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1)
             p = prediction.tolist()[:-1]
             for i in range(len(p)):
                 f.write(f"{i:<10}\t{sequence[i]:<20}\t{round(p[i], 4):<10}\n")
@@ -220,19 +204,32 @@ def flex_seq(input_seq, input_file):
             _prediction = prediction[:-1].reshape(1,-1)
             _outname = output_filename.with_name('{}_'.format(name.split("/")[-1]) + output_filename.stem + '.pdb')
             print("Saving prediction to {}.".format(_outname))
-            modify_bfactor_biotite(pdb_file, None, _outname, _prediction) #writing the prediction without the last token
             output_files.append(str(_outname))
     _outname = output_filename.with_name(name.split("/")[-1] + output_filename.stem + '.fasta')
     with open(_outname, 'w') as f:
         print("Saving fasta to {}.".format(_outname))
         for name, sequence in zip(names, sequences):
             f.write('>' + name + '\n')
             f.write(sequence + '\n')
-        output_files.append(str(_outname))
     return output_files, output_message
 @spaces.GPU
 def flex_3d(input_file):
     if not input_file:

         names.append(_name)
     return backbones, sequences, names
+def core_flex_seq(input_seq, input_file, force_cpu=False):
+    """Core logic decoupled from the GPU decorator."""
     if not input_seq:
         input_seq = ""
     if not input_seq.strip() and not input_file:
+        return None, "Provide a file/s or an input sequence/s"
     if input_file:
         if len(input_file) == 1:
             input_file = input_file[0]
     default_name = '{}'.format(datetime.now().strftime('%Y%m%d_%H%M%S'))
     output_name = default_name
+    sequences, names, backbones, flucts_list, pdb_files = [], [], [], [], []
     datapoint_for_eval = 'all'
     if input_seq:
                 sequence = proteins[record+1]
             else:
                 raise ValueError("You must adhere to the .fasta format")
             if datapoint_for_eval == 'all':
                 names.append(name)
                 sequences.append(sequence)
                 backbones.append(None)
     elif suffix == ".fasta":
         for record in SeqIO.parse(input_file, "fasta"):
             name = record.name
                 names.append(name)
                 sequences.append(str(record.seq))
                 backbones.append(None)
     elif suffix == ".pdb":
         backbones, sequences, names = process_pdb_file(input_file, backbones, sequences, names)
         pdb_files.append(input_file)
     elif suffix == ".pdb_list":
         for i in input_file:
             backbones, sequences, names = process_pdb_file(i, backbones, sequences, names)
             pdb_files.append(i)
     env_config = yaml.load(open('configs/env_config.yaml', 'r'), Loader=yaml.FullLoader)
     os.environ['HF_HOME'] = env_config['huggingface']['HF_HOME']
+    os.environ["CUDA_VISIBLE_DEVICES"] = env_config['gpus']['cuda_visible_device']
     config = yaml.load(open('configs/train_config.yaml', 'r'), Loader=yaml.FullLoader)
+    class_config = ClassConfig(config)
     class_config.adaptor_architecture = 'no-adaptor'
+    # --- DEVICE OVERRIDE LOGIC ---
+    if force_cpu:
+        target_device = 'cpu'
+    else:
+        target_device = config['inference_args']['device'] if torch.cuda.is_available() else 'cpu'
+    config['inference_args']['device'] = target_device
     model, tokenizer = PT5_classification_model(half_precision=config['mixed_precision'], class_config=class_config)
+    model.to(target_device)
     repo_id = "Honzus24/Flexpert_weights"
     file_weights = config['inference_args']['seq_model_path']
     weights_path = get_weights_path(repo_id, file_weights)
+    state_dict = torch.load(weights_path, map_location=target_device)
     model.load_state_dict(state_dict, strict=False)
     model.eval()
     data_to_collate = []
     for idx, (backbone, sequence) in enumerate(zip(backbones, sequences)):
+        sequence = sequence.replace('-', 'X')
         tokenizer_out = tokenizer(' '.join(sequence), add_special_tokens=True, return_tensors='pt')
+        tokenized_seq = tokenizer_out['input_ids'].to(target_device)
+        attention_mask = tokenizer_out['attention_mask'].to(target_device)
         data_to_collate.append({'input_ids': tokenized_seq[0,:], 'attention_mask': attention_mask[0,:]})
     data_collator = DataCollatorForTokenRegression(tokenizer)
+    batch = data_collator(data_to_collate)
+    batch.to(target_device)
     with torch.no_grad():
         output_logits = process_in_batches_and_combine(model, batch, config['inference_args']['batch_size'])
+        predictions = output_logits[:,:,0]
     output_filename = Path(config['inference_args']['prediction_output_dir'].format(output_name, "seq"))
     output_filename.parent.mkdir(parents=True, exist_ok=True)
     output_files = []
         with open(output_filename_new.with_suffix('.txt'), 'w') as f:
             f.write("Residue Number\tResidue ID\tFlexibility\n")
             prediction = prediction[mask.bool()]
             assert len(prediction) == len(sequence)+1, "Prediction length {} is not equal to sequence length + 1 {}".format(len(prediction), len(sequence)+1)
             p = prediction.tolist()[:-1]
             for i in range(len(p)):
                 f.write(f"{i:<10}\t{sequence[i]:<20}\t{round(p[i], 4):<10}\n")
             _prediction = prediction[:-1].reshape(1,-1)
             _outname = output_filename.with_name('{}_'.format(name.split("/")[-1]) + output_filename.stem + '.pdb')
             print("Saving prediction to {}.".format(_outname))
+            modify_bfactor_biotite(pdb_file, None, _outname, _prediction)
             output_files.append(str(_outname))
     _outname = output_filename.with_name(name.split("/")[-1] + output_filename.stem + '.fasta')
     with open(_outname, 'w') as f:
         print("Saving fasta to {}.".format(_outname))
         for name, sequence in zip(names, sequences):
             f.write('>' + name + '\n')
             f.write(sequence + '\n')
+    output_files.append(str(_outname))
     return output_files, output_message
+@spaces.GPU  # GPU-decorated entry point; flex_seq calls this inside a try/except
+def flex_seq_gpu(input_seq, input_file):  # same arguments as core_flex_seq, without force_cpu
+    return core_flex_seq(input_seq, input_file, force_cpu=False)  # device comes from the config, or 'cpu' when torch.cuda.is_available() is False
+def flex_seq(input_seq, input_file):  # tries the GPU-decorated path first, then retries the whole pipeline on CPU
+    try:
+        return flex_seq_gpu(input_seq, input_file)
+    except Exception as e:  # NOTE(review): broad catch - also catches non-GPU failures (e.g. the ValueError core_flex_seq raises for malformed FASTA input), which are then re-run on CPU
+        # ZeroGPU exceptions (like SpaceTaskError or timeouts) are caught here
+        print(f"ZeroGPU failed or timed out. Reason: {e}")
+        print("Falling back to CPU execution. This may take a while...")
+        return core_flex_seq(input_seq, input_file, force_cpu=True)  # second full run with target_device forced to 'cpu'
 @spaces.GPU
 def flex_3d(input_file):
     if not input_file: