protgpt3
/

ProtGPT3-MSA

@@ -76,10 +76,50 @@ Load the model and tokenizer:
 ```python
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 model_id = "protgpt3/ProtGPT3-MSA"  # Replace with the final checkpoint name
-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.bfloat16,
@@ -97,6 +137,7 @@ Use the `<no_gap>` modality token for unaligned sequences. Separate homologous s
 ```python
 import torch
 homologs = [
     "MKTAYIAKQRQISFVKSHFSRQDILD",
     "MKTVYIAKQRQISFVKSHFSRQDILD",
@@ -104,7 +145,7 @@ homologs = [
     # Add up to 15 homologous protein sequences
 ]
-prompt = "<no_gap>" + "<s>".join(homologs) + "<s>"
 inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
@@ -130,13 +171,14 @@ Use the `<gap>` modality token for aligned sequences. Gap characters may be incl
 ```python
 import torch
 aligned_homologs = [
     "MKTAYIAKQRQI--SFVKSHFSRQDILD",
     "MKTVYIAKQRQI--SFVKSHFSRQDILD",
     "MKTAYIAKQRQINNSFVKSHFSRQNILD",
 ]
-prompt = "<gap>" + "<s>".join(aligned_homologs) + "<s>"
 inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

 ```python
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
+import random
+import re
+from typing import List
+# ---- Initialise useful methods to prompt ProtGPT3-MSA ----
+def process_style(seq: str, gap: bool):
+    """Normalise one sequence before it is split into prompt tokens.
+
+    The sequence is uppercased and every "X" is dropped (a lowercase "x"
+    is dropped too, because uppercasing happens first). Gap characters
+    ("-") are kept when ``gap`` is true and stripped otherwise.
+    """
+    # Aligned (gap) mode keeps "-"; unaligned mode strips it up front.
+    kept = seq if gap else seq.replace("-", "")
+    return re.sub(r"[X]", "", kept.upper())
+def build_prompt(
+    sequences: List[str],
+    gap: bool = False,
+) -> str:
+    """Build a space-separated prompt for ProtGPT3-MSA.
+
+    The prompt consists of ``<|bos|>``, the direction token, the modality
+    token (``<gap>`` or ``<no_gap>``), then every sequence in random order
+    as ``<s>`` followed by one token per character of the processed
+    sequence, and finally a trailing ``<s>``.
+
+    Args:
+        sequences: Homologous sequences. The caller's list is not modified.
+        gap: If true, the sequences are treated as aligned: gap characters
+            are kept and all sequences must have the same length.
+
+    Returns:
+        The prompt, with tokens joined by single spaces.
+
+    Raises:
+        AssertionError: If ``gap`` is true and the sequences differ in length.
+    """
+    # Shuffle a copy: random.shuffle works in place and would otherwise
+    # reorder the caller's list as a hidden side effect.
+    shuffled = list(sequences)
+    random.shuffle(shuffled)
+    direction = "1" # change this to "2" for reversed C-to-N generation
+    if gap:
+        gap_token = "<gap>"
+        assert all(len(s) == len(shuffled[0]) for s in shuffled), "Sequences in the prompt have different len(), but should be aligned, either align them or use no_gap mode"
+    else:
+        gap_token = "<no_gap>"
+    tokens: List[str] = ["<|bos|>", direction, gap_token]
+    for seq in shuffled:
+        tokens.append("<s>")
+        # Each character of the processed sequence becomes its own token.
+        tokens.extend(process_style(seq, gap=gap))
+    # Match train-time separator before continuation
+    tokens.append("<s>")
+    return " ".join(tokens)
+## --------------------------------------
 model_id = "protgpt3/ProtGPT3-MSA"  # Replace with the final checkpoint name
+# Load tokenizer for generation
+tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True,add_bos_token=True, add_eos_token=False)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.bfloat16,
 ```python
 import torch
 homologs = [
     "MKTAYIAKQRQISFVKSHFSRQDILD",
     "MKTVYIAKQRQISFVKSHFSRQDILD",
     # Add up to 15 homologous protein sequences
 ]
+# gap defaults to False, so build_prompt emits the <no_gap> modality token
+prompt = build_prompt(sequences=homologs)
 inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 ```python
 import torch
+# must have the same length and be aligned
 aligned_homologs = [
     "MKTAYIAKQRQI--SFVKSHFSRQDILD",
     "MKTVYIAKQRQI--SFVKSHFSRQDILD",
     "MKTAYIAKQRQINNSFVKSHFSRQNILD",
 ]
+# gap=True selects the <gap> modality token and keeps the "-" characters
+prompt = build_prompt(sequences=aligned_homologs, gap=True)
 inputs = tokenizer(prompt, return_tensors="pt").to(model.device)