Update README.md
Browse files
README.md
CHANGED
|
@@ -51,13 +51,23 @@ Ankh3 is a protein language model that is jointly optimized on two objectives:
|
|
| 51 |
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5EncoderModel
|
| 52 |
import torch
|
| 53 |
|
|
|
|
| 54 |
sequence = "MDTAYPREDTRAPTPSKAGAHTALTLGAPHPPPRDHLIWSVFSTLYLNLCCLGFLALAYSIKARDQKVVGDLEAARRFGSKAKCYNILAAMWTLVPPLLLLGLVVTGALHLARLAKDSAAFFSTKFDDADYD"
|
| 55 |
|
| 56 |
-
ckpt = "
|
|
|
|
|
|
|
| 57 |
tokenizer = T5Tokenizer.from_pretrained(ckpt)
|
|
|
|
| 58 |
# To use the encoder representation using the NLU prefix:
|
| 59 |
encoder_model = T5EncoderModel.from_pretrained(ckpt).eval()
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
nlu_sequence = "[NLU]" + sequence
|
| 62 |
encoded_nlu_sequence = tokenizer(nlu_sequence, add_special_tokens=True, return_tensors="pt", is_split_into_words=False)
|
| 63 |
|
|
@@ -73,7 +83,7 @@ import torch
|
|
| 73 |
|
| 74 |
sequence = "MDTAYPREDTRAPTPSKAGAHTALTLGAPHPPPRDHLIWSVFSTLYLNLCCLGFLALAYSIKARDQKVVGDLEAARRFGSKAKCYNILAAMWTLVPPLLLLGLVVTGALHLARLAKDSAAFFSTKFDDADYD"
|
| 75 |
|
| 76 |
-
ckpt = "
|
| 77 |
tokenizer = T5Tokenizer.from_pretrained(ckpt)
|
| 78 |
# To use the sequence-to-sequence task using the S2S prefix:
|
| 79 |
model = T5ForConditionalGeneration.from_pretrained(ckpt).eval()
|
|
|
|
| 51 |
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5EncoderModel
|
| 52 |
import torch
|
| 53 |
|
| 54 |
+
# Random sequence from UniProt; Ankh3 most likely saw it during pre-training.
|
| 55 |
sequence = "MDTAYPREDTRAPTPSKAGAHTALTLGAPHPPPRDHLIWSVFSTLYLNLCCLGFLALAYSIKARDQKVVGDLEAARRFGSKAKCYNILAAMWTLVPPLLLLGLVVTGALHLARLAKDSAAFFSTKFDDADYD"
|
| 56 |
|
| 57 |
+
ckpt = "ElnaggarLab/ankh3-large"
|
| 58 |
+
|
| 59 |
+
# Make sure to use `T5Tokenizer`, not `AutoTokenizer`.
|
| 60 |
tokenizer = T5Tokenizer.from_pretrained(ckpt)
|
| 61 |
+
|
| 62 |
# To use the encoder representation using the NLU prefix:
|
| 63 |
encoder_model = T5EncoderModel.from_pretrained(ckpt).eval()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# For extracting embeddings, consider trying the '[S2S]' prefix.
|
| 67 |
+
# Since this prefix was specifically used to denote sequence completion
|
| 68 |
+
# during the model's pre-training, its use can sometimes
|
| 69 |
+
# lead to improved embedding quality.
|
| 70 |
+
|
| 71 |
nlu_sequence = "[NLU]" + sequence
|
| 72 |
encoded_nlu_sequence = tokenizer(nlu_sequence, add_special_tokens=True, return_tensors="pt", is_split_into_words=False)
|
| 73 |
|
|
|
|
| 83 |
|
| 84 |
sequence = "MDTAYPREDTRAPTPSKAGAHTALTLGAPHPPPRDHLIWSVFSTLYLNLCCLGFLALAYSIKARDQKVVGDLEAARRFGSKAKCYNILAAMWTLVPPLLLLGLVVTGALHLARLAKDSAAFFSTKFDDADYD"
|
| 85 |
|
| 86 |
+
ckpt = "ElnaggarLab/ankh3-large"
|
| 87 |
tokenizer = T5Tokenizer.from_pretrained(ckpt)
|
| 88 |
# To use the sequence-to-sequence task using the S2S prefix:
|
| 89 |
model = T5ForConditionalGeneration.from_pretrained(ckpt).eval()
|