Updated README.md to fix minor errors and bring it in line with Hugging Face transformers v5
Updated README.md to be in line with Hugging Face transformers v5: https://github.com/huggingface/transformers/releases/tag/v5.0.0
Also fixed minor syntax errors:
tokenizer.batch_encode_plus(**sequences_example**, ...
min_len = min([ len(s) for s in **folding_example**])
max_len = max([ len(s) for s in **folding_example**])
etc.
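For context on the v5 part: the linked release drops a number of long-deprecated APIs, which appears to be why `tokenizer.batch_encode_plus(...)` is replaced by calling the tokenizer object directly throughout the diff below. A minimal sketch of the v5-style call (the input string is a hypothetical, already pre-processed example):

```python
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)

# v5 style: call the tokenizer directly instead of batch_encode_plus
batch = tokenizer(["<AA2fold> P R T E I N O"],
                  add_special_tokens=True,
                  padding="longest",
                  return_tensors='pt')
print(batch.input_ids.shape)  # (batch_size, longest_tokenized_length)
```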
README.md
CHANGED
@@ -46,10 +46,11 @@ Feature extraction:
 ```python
 from transformers import T5Tokenizer, T5EncoderModel
 import torch
+import re
 device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

 # Load the tokenizer
-tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)
+tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)

 # Load the model
 model = T5EncoderModel.from_pretrained("Rostlab/ProstT5").to(device)
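A later hunk keeps the README's `model.full() if device=='cpu' else model.half()` line as context; note that `torch.nn.Module` has no `full()` method, so `model.float()` is presumably what was meant. A hedged sketch of the load-and-cast step under that assumption:

```python
import torch
from transformers import T5Tokenizer, T5EncoderModel

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/ProstT5").to(device)

# half precision on GPU, full precision on CPU
# (assumes model.float() is the intended spelling of the README's model.full())
model = model.half() if device.type == 'cuda' else model.float()
```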
@@ -66,12 +67,12 @@ sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequence_examples]
 # add pre-fixes accordingly (this already expects 3Di-sequences to be lower-case)
 # if you go from AAs to 3Di (or if you want to embed AAs), you need to prepend "<AA2fold>"
 # if you go from 3Di to AAs (or if you want to embed 3Di), you need to prepend "<fold2AA>"
-sequence_examples = [
-
+sequence_examples = ["<AA2fold>" + " " + s if s.isupper() else "<fold2AA>" + " " + s
+                     for s in sequence_examples
 ]

 # tokenize sequences and pad up to the longest sequence in the batch
-ids = tokenizer
+ids = tokenizer(sequence_examples, add_special_tokens=True, padding="longest",return_tensors='pt').to(device)

 # generate embeddings
 with torch.no_grad():
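To make the prefix convention concrete, a small sketch of what this preprocessing yields for one hypothetical upper-case AA string and one lower-case 3Di string:

```python
import re

examples = ["PRTEINO", "prteino"]  # hypothetical AA and 3Di inputs
examples = [" ".join(list(re.sub(r"[UZOB]", "X", s))) for s in examples]
examples = ["<AA2fold>" + " " + s if s.isupper() else "<fold2AA>" + " " + s
            for s in examples]
print(examples)
# ['<AA2fold> P R T E I N O', '<fold2AA> p r t e i n o']
```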
@@ -81,9 +82,9 @@ with torch.no_grad():
 )

 # extract residue embeddings for the first ([0,:]) sequence in the batch and remove padded & special tokens, incl. prefix ([0,1:8])
-emb_0 =
+emb_0 = embedding_rpr.last_hidden_state[0,1:8] # shape (7 x 1024)
 # same for the second ([1,:]) sequence but taking into account different sequence lengths ([1,:6])
-emb_1 =
+emb_1 = embedding_rpr.last_hidden_state[1,1:6] # shape (5 x 1024)

 # if you want to derive a single representation (per-protein embedding) for the whole protein
 emb_0_per_protein = emb_0.mean(dim=0) # shape (1024)
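The hard-coded slices ([0,1:8], [1,1:6]) only fit the example lengths. A hedged, length-agnostic alternative, assuming each tokenized sequence is one prefix token, then the residues, then one trailing special token (`residue_embeddings` is a hypothetical helper, not part of the README):

```python
def residue_embeddings(embedding_rpr, attention_mask):
    """Slice per-residue embeddings out of a padded batch.

    Assumed token layout per sequence: [prefix] r_1 ... r_n [</s>] [pad ...]
    """
    per_seq = []
    for i, mask in enumerate(attention_mask):
        true_len = int(mask.sum())  # prefix + residues + trailing special token
        per_seq.append(embedding_rpr.last_hidden_state[i, 1:true_len - 1])
    return per_seq

# e.g.: emb_0 = residue_embeddings(embedding_rpr, ids.attention_mask)[0]
```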
@@ -93,10 +94,11 @@ Translation ("folding", i.e., AA to 3Di):
 ```python
 from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
 import torch
+import re
 device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

 # Load the tokenizer
-tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)
+tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)

 # Load the model
 model = AutoModelForSeq2SeqLM.from_pretrained("Rostlab/ProstT5").to(device)
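Unlike the feature-extraction block, which loads only the encoder (`T5EncoderModel`), this section needs the full encoder-decoder stack for generation. A quick hedged sanity check (the resolved class name is an assumption based on the checkpoint's T5 config):

```python
# AutoModelForSeq2SeqLM should resolve the T5 config to the full
# encoder-decoder model; T5EncoderModel above is the encoder only.
print(type(model).__name__)  # expected: T5ForConditionalGeneration
```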
@@ -108,24 +110,24 @@ model.full() if device=='cpu' else model.half()
 # Amino acid sequences are expected to be upper-case ("PRTEINO" below)
 # while 3Di-sequences need to be lower-case.
 sequence_examples = ["PRTEINO", "SEQWENCE"]
-min_len = min([ len(s) for s in folding_example])
-max_len = max([ len(s) for s in folding_example])
+min_len = min([ len(s) for s in sequence_examples])
+max_len = max([ len(s) for s in sequence_examples])

 # replace all rare/ambiguous amino acids by X (3Di sequences does not have those) and introduce white-space between all sequences (AAs and 3Di)
 sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequence_examples]

 # add pre-fixes accordingly. For the translation from AAs to 3Di, you need to prepend "<AA2fold>"
-sequence_examples = [
+sequence_examples = ["<AA2fold>" + " " + s for s in sequence_examples]

 # tokenize sequences and pad up to the longest sequence in the batch
-ids = tokenizer
+ids = tokenizer(sequence_examples,
                 add_special_tokens=True,
                 padding="longest",
-                return_tensors='pt').to(device)
+                return_tensors='pt').to(device)

 # Generation configuration for "folding" (AA-->3Di)
 gen_kwargs_aa2fold = {
-        "do_sample": True,
+        "do_sample": True,
         "num_beams": 3,
         "top_p" : 0.95,
         "temperature" : 1.2,
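min_len and max_len are computed here but not used inside this hunk; presumably they bound the length of the generated 3Di sequence in the `model.generate` call further down the README. A hedged sketch of how that wiring typically looks (the exact call site is outside the shown hunks):

```python
with torch.no_grad():
    translations = model.generate(
        ids.input_ids,
        attention_mask=ids.attention_mask,
        min_length=min_len,  # assumed: 3Di output bounded by input AA lengths
        max_length=max_len,
        early_stopping=True,
        num_return_sequences=1,
        **gen_kwargs_aa2fold
    )
```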
@@ -145,20 +147,20 @@ with torch.no_grad():
         **gen_kwargs_aa2fold
 )
 # Decode and remove white-spaces between tokens
-decoded_translations = tokenizer.batch_decode(
-structure_sequences = [
+decoded_translations = tokenizer.batch_decode(translations, skip_special_tokens=True)
+structure_sequences = ["".join(ts.split(" ")) for ts in decoded_translations] # predicted 3Di strings

 # Now we can use the same model and invert the translation logic
 # to generate an amino acid sequence from the predicted 3Di-sequence (3Di-->AA)

 # add pre-fixes accordingly. For the translation from 3Di to AA (3Di-->AA), you need to prepend "<fold2AA>"
-sequence_examples_backtranslation = [
+sequence_examples_backtranslation = ["<fold2AA>" + " " + s for s in decoded_translations]

 # tokenize sequences and pad up to the longest sequence in the batch
-ids_backtranslation = tokenizer
+ids_backtranslation = tokenizer(sequence_examples_backtranslation,
                                 add_special_tokens=True,
                                 padding="longest",
-                                return_tensors='pt').to(device)
+                                return_tensors='pt').to(device)

 # Example generation configuration for "inverse folding" (3Di-->AA)
 gen_kwargs_fold2AA = {
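Since the generation configs enable sampling, the backtranslated amino-acid strings will generally not reproduce the inputs exactly. A hedged sketch for eyeballing the round trip once `aminoAcid_sequences` exists (`original_sequences` is a hypothetical copy of the inputs saved before preprocessing):

```python
original_sequences = ["PRTEINO", "SEQWENCE"]  # saved before any preprocessing
for orig, back in zip(original_sequences, aminoAcid_sequences):
    matches = sum(a == b for a, b in zip(orig, back))
    print(f"{orig} -> {back}: {matches}/{len(orig)} identical positions")
```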
@@ -181,9 +183,8 @@ with torch.no_grad():
         **gen_kwargs_fold2AA
 )
 # Decode and remove white-spaces between tokens
-decoded_backtranslations = tokenizer.batch_decode(
-aminoAcid_sequences = [
-
+decoded_backtranslations = tokenizer.batch_decode(backtranslations, skip_special_tokens=True)
+aminoAcid_sequences = ["".join(ts.split(" ")) for ts in decoded_backtranslations] # predicted amino acid strings
 ```

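One practical footnote on the configurations above: with `"do_sample": True`, both translation directions are stochastic, so repeated runs yield different sequences unless the RNG is pinned. A minimal sketch (plain PyTorch, nothing ProstT5-specific):

```python
import torch

torch.manual_seed(42)  # fix sampling so repeated generate() calls are reproducible
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
```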