Update README.md
## Example

The following script shows how to run ProtST with Gaudi and [optimum-intel](https://github.com/huggingface/optimum-intel) optimization on a zero-shot classification task.
```diff
import logging
import functools
from tqdm import tqdm
import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer, AutoConfig
+import habana_frameworks.torch

logger = logging.getLogger(__name__)


def tokenize_protein(example, protein_tokenizer=None, padding=None):
    protein_seqs = example["prot_seq"]
-    protein_inputs = protein_tokenizer(protein_seqs, padding=padding, add_special_tokens=True)
+    protein_inputs = protein_tokenizer(protein_seqs, padding="max_length", truncation=True, add_special_tokens=True, max_length=1024)
    example["protein_input_ids"] = protein_inputs.input_ids
    example["protein_attention_mask"] = protein_inputs.attention_mask
    ...

def label_embedding(labels, text_tokenizer, text_model, device):
    ...
    with torch.inference_mode():
        for label in labels:
            label_input_ids = text_tokenizer.encode(label, max_length=128,
-                                                    truncation=True, add_special_tokens=False)
+                                                    truncation=True, add_special_tokens=False, padding="max_length")
            label_input_ids = [text_tokenizer.cls_token_id] + label_input_ids
            label_input_ids = torch.tensor(label_input_ids, dtype=torch.long, device=device).unsqueeze(0)
            attention_mask = label_input_ids != text_tokenizer.pad_token_id
            ...
            text_outputs = text_model(label_input_ids, attention_mask=attention_mask)
-            label_feature.append(text_outputs["text_feature"])
+            label_feature.append(text_outputs["text_feature"].clone())
    label_feature = torch.cat(label_feature, dim=0)
    label_feature = label_feature / label_feature.norm(dim=-1, keepdim=True)
    ...

if __name__ == "__main__":
    ...
    protst_model = AutoModel.from_pretrained("mila-intel/ProtST-esm1b", trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
    protein_model = protst_model.protein_model
    text_model = protst_model.text_model
    logit_scale = protst_model.logit_scale
    logit_scale.requires_grad = False
    ...
    label_feature = label_embedding(labels, text_tokenizer, text_model, device)
    zero_shot_eval(logger, device, test_dataset, "localization",
                   protein_model, logit_scale, label_feature)
```
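
The hunks above elide the data-loading and evaluation glue between the function definitions and the `if __name__ == "__main__":` block. For orientation, here is a minimal sketch of what that glue might look like; the dataset id, tokenizer checkpoints, and label prompts are assumptions for illustration, while `tokenize_protein`, `label_embedding`, and `zero_shot_eval` are the functions from the script above.

```python
import functools

import torch
from datasets import load_dataset
from transformers import AutoTokenizer

# Gaudi device; the CPU variant below would use torch.device("cpu") instead.
device = torch.device("hpu")

# Assumed checkpoints: ProtST-esm1b pairs an ESM-1b protein encoder with a
# PubMedBERT text encoder, so their tokenizers are loaded here.
protein_tokenizer = AutoTokenizer.from_pretrained("facebook/esm1b_t33_650M_UR50S")
text_tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
)

# Hypothetical dataset id; the real script evaluates a subcellular-localization
# test split whose "prot_seq" column tokenize_protein consumes.
test_dataset = load_dataset("mila-intel/ProtST-SubcellularLocalization", split="test")
test_dataset = test_dataset.map(
    functools.partial(tokenize_protein, protein_tokenizer=protein_tokenizer, padding="max_length")
)

# Hypothetical text prompts for the zero-shot "localization" labels.
labels = ["the protein locates at nucleus", "the protein locates at cytoplasm"]
```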

Run ProtST on CPU with [optimum-intel](https://github.com/huggingface/optimum-intel) optimization.
```diff
...
protst_model = AutoModel.from_pretrained("mila-intel/ProtST-esm1b", trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
protein_model = protst_model.protein_model
+import intel_extension_for_pytorch as ipex
+from optimum.intel.generation.modeling import jit_trace
+protein_model = ipex.optimize(protein_model, dtype=torch.bfloat16, inplace=True)
+protein_model = jit_trace(protein_model, "sequence-classification")
...
```
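
Here `ipex.optimize` prepares the protein encoder for bfloat16 CPU inference (operator fusion and weight prepacking), and `jit_trace` compiles it to TorchScript so later calls run through the optimized graph. Below is a hedged sketch of invoking the traced model; the exact calling convention of the traced module is an assumption.

```python
import torch

# Assumption: the traced module still takes (input_ids, attention_mask) like the
# original forward. Run under inference_mode with bf16 autocast on CPU.
with torch.inference_mode(), torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    protein_outputs = protein_model(protein_input_ids, protein_attention_mask)
```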