protgpt3
/

ProtGPT3-MSA

Text Generation

protein-language-model

protein-generation

multiple-sequence-alignment

few-shot-prompting

homolog-conditioned-generation

mixture-of-experts

text-generation-inference

Model card Files Files and versions

protgpt3 committed on 13 days ago

Commit

bf4fca6

·

verified ·

1 Parent(s): 376da1e

Update README.md

Files changed (1) hide show

README.md +3 -3

README.md CHANGED Viewed

@@ -90,7 +90,7 @@ def process_style(seq: str, gap: bool):
         return re.sub(r"[X]", "", seq.replace("-", "").upper())
 def build_prompt(
-    sequences: List[str],
     gap: bool = False,
 ) -> str:
     """Build prompt for ProtGPT3-MSA"""
@@ -118,7 +118,7 @@ def build_prompt(
 model_id = "protgpt3/ProtGPT3-MSA"  # Replace with the final checkpoint name
 # Load tokenizer for generation
-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True,add_bos_token=True, add_eos_token=False)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
@@ -178,7 +178,7 @@ aligned_homologs = [
     "MKTAYIAKQRQINNSFVKSHFSRQNILD",
 ]
-prompt = build_prompt(sequences=homologs, gap=True)
 inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

         return re.sub(r"[X]", "", seq.replace("-", "").upper())
 def build_prompt(
+    sequences: list,
     gap: bool = False,
 ) -> str:
     """Build prompt for ProtGPT3-MSA"""
 model_id = "protgpt3/ProtGPT3-MSA"  # Replace with the final checkpoint name
 # Load tokenizer for generation
+tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True,add_bos_token=False, add_eos_token=False) # BOS token manually added in build_prompt
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     "MKTAYIAKQRQINNSFVKSHFSRQNILD",
 ]
+prompt = build_prompt(sequences=aligned_homologs, gap=True)
 inputs = tokenizer(prompt, return_tensors="pt").to(model.device)