Update README.md
Browse files
README.md
CHANGED
|
@@ -51,13 +51,23 @@ Ankh3 is a protein language model that is jointly optimized on two objectives:
|
|
| 51 |
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5EncoderModel
|
| 52 |
import torch
|
| 53 |
|
|
|
|
| 54 |
sequence = "MDTAYPREDTRAPTPSKAGAHTALTLGAPHPPPRDHLIWSVFSTLYLNLCCLGFLALAYSIKARDQKVVGDLEAARRFGSKAKCYNILAAMWTLVPPLLLLGLVVTGALHLARLAKDSAAFFSTKFDDADYD"
|
| 55 |
|
| 56 |
-
ckpt = "
|
|
|
|
|
|
|
| 57 |
tokenizer = T5Tokenizer.from_pretrained(ckpt)
|
|
|
|
| 58 |
# To use the encoder representation using the NLU prefix:
|
| 59 |
encoder_model = T5EncoderModel.from_pretrained(ckpt).eval()
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
nlu_sequence = "[NLU]" + sequence
|
| 62 |
encoded_nlu_sequence = tokenizer(nlu_sequence, add_special_tokens=True, return_tensors="pt", is_split_into_words=False)
|
| 63 |
|
|
@@ -73,7 +83,7 @@ import torch
|
|
| 73 |
|
| 74 |
sequence = "MDTAYPREDTRAPTPSKAGAHTALTLGAPHPPPRDHLIWSVFSTLYLNLCCLGFLALAYSIKARDQKVVGDLEAARRFGSKAKCYNILAAMWTLVPPLLLLGLVVTGALHLARLAKDSAAFFSTKFDDADYD"
|
| 75 |
|
| 76 |
-
ckpt = "
|
| 77 |
tokenizer = T5Tokenizer.from_pretrained(ckpt)
|
| 78 |
# To use the sequence-to-sequence task using the S2S prefix:
|
| 79 |
model = T5ForConditionalGeneration.from_pretrained(ckpt).eval()
|
|
|
|
| 51 |
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5EncoderModel
|
| 52 |
import torch
|
| 53 |
|
| 54 |
+
# Random sequence from UniProt; Ankh3 most likely saw it during pre-training.
|
| 55 |
sequence = "MDTAYPREDTRAPTPSKAGAHTALTLGAPHPPPRDHLIWSVFSTLYLNLCCLGFLALAYSIKARDQKVVGDLEAARRFGSKAKCYNILAAMWTLVPPLLLLGLVVTGALHLARLAKDSAAFFSTKFDDADYD"
|
| 56 |
|
| 57 |
+
ckpt = "ElnaggarLab/ankh3-large"
|
| 58 |
+
|
| 59 |
+
# Make sure to use `T5Tokenizer`, not `AutoTokenizer`.
|
| 60 |
tokenizer = T5Tokenizer.from_pretrained(ckpt)
|
| 61 |
+
|
| 62 |
# To use the encoder representation using the NLU prefix:
|
| 63 |
encoder_model = T5EncoderModel.from_pretrained(ckpt).eval()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# For extracting embeddings, consider trying the '[S2S]' prefix.
|
| 67 |
+
# Since this prefix was specifically used to denote sequence completion
|
| 68 |
+
# during the model's pre-training, its use can sometimes
|
| 69 |
+
# lead to improved embedding quality.
|
| 70 |
+
|
| 71 |
nlu_sequence = "[NLU]" + sequence
|
| 72 |
encoded_nlu_sequence = tokenizer(nlu_sequence, add_special_tokens=True, return_tensors="pt", is_split_into_words=False)
|
| 73 |
|
|
|
|
| 83 |
|
| 84 |
sequence = "MDTAYPREDTRAPTPSKAGAHTALTLGAPHPPPRDHLIWSVFSTLYLNLCCLGFLALAYSIKARDQKVVGDLEAARRFGSKAKCYNILAAMWTLVPPLLLLGLVVTGALHLARLAKDSAAFFSTKFDDADYD"
|
| 85 |
|
| 86 |
+
ckpt = "ElnaggarLab/ankh3-large"
|
| 87 |
tokenizer = T5Tokenizer.from_pretrained(ckpt)
|
| 88 |
# To use the sequence-to-sequence task using the S2S prefix:
|
| 89 |
model = T5ForConditionalGeneration.from_pretrained(ckpt).eval()
|