AeneasTews committed on
Commit
a8fa074
·
verified ·
1 Parent(s): d7d097d

Updated README.md to fix minor errors and to bring it in line with the new Hugging Face Transformers v5

Browse files

Updated README.md to be in line with Hugging Face Transformers v5: https://github.com/huggingface/transformers/releases/tag/v5.0.0

Also fixed minor syntax errors, e.g.:
tokenizer.batch_encode_plus(**sequences_example**, ...

min_len = min([ len(s) for s in **folding_example**])
max_len = max([ len(s) for s in **folding_example**])

etc.

Files changed (1) hide show
  1. README.md +22 -21
README.md CHANGED
@@ -46,10 +46,11 @@ Feature extraction:
46
  ```python
47
  from transformers import T5Tokenizer, T5EncoderModel
48
  import torch
 
49
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
50
 
51
  # Load the tokenizer
52
- tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False).to(device)
53
 
54
  # Load the model
55
  model = T5EncoderModel.from_pretrained("Rostlab/ProstT5").to(device)
@@ -66,12 +67,12 @@ sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequen
66
  # add pre-fixes accordingly (this already expects 3Di-sequences to be lower-case)
67
  # if you go from AAs to 3Di (or if you want to embed AAs), you need to prepend "<AA2fold>"
68
  # if you go from 3Di to AAs (or if you want to embed 3Di), you need to prepend "<fold2AA>"
69
- sequence_examples = [ "<AA2fold>" + " " + s if s.isupper() else "<fold2AA>" + " " + s
70
- for s in sequence_examples
71
  ]
72
 
73
  # tokenize sequences and pad up to the longest sequence in the batch
74
- ids = tokenizer.batch_encode_plus(sequences_example, add_special_tokens=True, padding="longest",return_tensors='pt').to(device))
75
 
76
  # generate embeddings
77
  with torch.no_grad():
@@ -81,9 +82,9 @@ with torch.no_grad():
81
  )
82
 
83
  # extract residue embeddings for the first ([0,:]) sequence in the batch and remove padded & special tokens, incl. prefix ([0,1:8])
84
- emb_0 = embedding_repr.last_hidden_state[0,1:8] # shape (7 x 1024)
85
  # same for the second ([1,:]) sequence but taking into account different sequence lengths ([1,:6])
86
- emb_1 = embedding_repr.last_hidden_state[1,1:6] # shape (5 x 1024)
87
 
88
  # if you want to derive a single representation (per-protein embedding) for the whole protein
89
  emb_0_per_protein = emb_0.mean(dim=0) # shape (1024)
@@ -93,10 +94,11 @@ Translation ("folding", i.e., AA to 3Di):
93
  ```python
94
  from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
95
  import torch
 
96
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
97
 
98
  # Load the tokenizer
99
- tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False).to(device)
100
 
101
  # Load the model
102
  model = AutoModelForSeq2SeqLM.from_pretrained("Rostlab/ProstT5").to(device)
@@ -108,24 +110,24 @@ model.full() if device=='cpu' else model.half()
108
  # Amino acid sequences are expected to be upper-case ("PRTEINO" below)
109
  # while 3Di-sequences need to be lower-case.
110
  sequence_examples = ["PRTEINO", "SEQWENCE"]
111
- min_len = min([ len(s) for s in folding_example])
112
- max_len = max([ len(s) for s in folding_example])
113
 
114
  # replace all rare/ambiguous amino acids by X (3Di sequences does not have those) and introduce white-space between all sequences (AAs and 3Di)
115
  sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequence_examples]
116
 
117
  # add pre-fixes accordingly. For the translation from AAs to 3Di, you need to prepend "<AA2fold>"
118
- sequence_examples = [ "<AA2fold>" + " " + s for s in sequence_examples]
119
 
120
  # tokenize sequences and pad up to the longest sequence in the batch
121
- ids = tokenizer.batch_encode_plus(sequences_example,
122
  add_special_tokens=True,
123
  padding="longest",
124
- return_tensors='pt').to(device))
125
 
126
  # Generation configuration for "folding" (AA-->3Di)
127
  gen_kwargs_aa2fold = {
128
- "do_sample": True,
129
  "num_beams": 3,
130
  "top_p" : 0.95,
131
  "temperature" : 1.2,
@@ -145,20 +147,20 @@ with torch.no_grad():
145
  **gen_kwargs_aa2fold
146
  )
147
  # Decode and remove white-spaces between tokens
148
- decoded_translations = tokenizer.batch_decode( translations, skip_special_tokens=True )
149
- structure_sequences = [ "".join(ts.split(" ")) for ts in decoded_translations ] # predicted 3Di strings
150
 
151
  # Now we can use the same model and invert the translation logic
152
  # to generate an amino acid sequence from the predicted 3Di-sequence (3Di-->AA)
153
 
154
  # add pre-fixes accordingly. For the translation from 3Di to AA (3Di-->AA), you need to prepend "<fold2AA>"
155
- sequence_examples_backtranslation = [ "<fold2AA>" + " " + s for s in decoded_translations]
156
 
157
  # tokenize sequences and pad up to the longest sequence in the batch
158
- ids_backtranslation = tokenizer.batch_encode_plus(sequence_examples_backtranslation,
159
  add_special_tokens=True,
160
  padding="longest",
161
- return_tensors='pt').to(device))
162
 
163
  # Example generation configuration for "inverse folding" (3Di-->AA)
164
  gen_kwargs_fold2AA = {
@@ -181,9 +183,8 @@ with torch.no_grad():
181
  **gen_kwargs_fold2AA
182
  )
183
  # Decode and remove white-spaces between tokens
184
- decoded_backtranslations = tokenizer.batch_decode( backtranslations, skip_special_tokens=True )
185
- aminoAcid_sequences = [ "".join(ts.split(" ")) for ts in decoded_backtranslations ] # predicted amino acid strings
186
-
187
  ```
188
 
189
 
 
46
  ```python
47
  from transformers import T5Tokenizer, T5EncoderModel
48
  import torch
49
+ import re
50
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
51
 
52
  # Load the tokenizer
53
+ tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)
54
 
55
  # Load the model
56
  model = T5EncoderModel.from_pretrained("Rostlab/ProstT5").to(device)
 
67
  # add pre-fixes accordingly (this already expects 3Di-sequences to be lower-case)
68
  # if you go from AAs to 3Di (or if you want to embed AAs), you need to prepend "<AA2fold>"
69
  # if you go from 3Di to AAs (or if you want to embed 3Di), you need to prepend "<fold2AA>"
70
+ sequence_examples = ["<AA2fold>" + " " + s if s.isupper() else "<fold2AA>" + " " + s
71
+ for s in sequence_examples
72
  ]
73
 
74
  # tokenize sequences and pad up to the longest sequence in the batch
75
+ ids = tokenizer(sequence_examples, add_special_tokens=True, padding="longest",return_tensors='pt').to(device)
76
 
77
  # generate embeddings
78
  with torch.no_grad():
 
82
  )
83
 
84
  # extract residue embeddings for the first ([0,:]) sequence in the batch and remove padded & special tokens, incl. prefix ([0,1:8])
85
+ emb_0 = embedding_rpr.last_hidden_state[0,1:8] # shape (7 x 1024)
86
  # same for the second ([1,:]) sequence but taking into account different sequence lengths ([1,:6])
87
+ emb_1 = embedding_rpr.last_hidden_state[1,1:6] # shape (5 x 1024)
88
 
89
  # if you want to derive a single representation (per-protein embedding) for the whole protein
90
  emb_0_per_protein = emb_0.mean(dim=0) # shape (1024)
 
94
  ```python
95
  from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
96
  import torch
97
+ import re
98
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
99
 
100
  # Load the tokenizer
101
+ tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)
102
 
103
  # Load the model
104
  model = AutoModelForSeq2SeqLM.from_pretrained("Rostlab/ProstT5").to(device)
 
110
  # Amino acid sequences are expected to be upper-case ("PRTEINO" below)
111
  # while 3Di-sequences need to be lower-case.
112
  sequence_examples = ["PRTEINO", "SEQWENCE"]
113
+ min_len = min([ len(s) for s in sequence_examples])
114
+ max_len = max([ len(s) for s in sequence_examples])
115
 
116
  # replace all rare/ambiguous amino acids by X (3Di sequences does not have those) and introduce white-space between all sequences (AAs and 3Di)
117
  sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequence_examples]
118
 
119
  # add pre-fixes accordingly. For the translation from AAs to 3Di, you need to prepend "<AA2fold>"
120
+ sequence_examples = ["<AA2fold>" + " " + s for s in sequence_examples]
121
 
122
  # tokenize sequences and pad up to the longest sequence in the batch
123
+ ids = tokenizer(sequence_examples,
124
  add_special_tokens=True,
125
  padding="longest",
126
+ return_tensors='pt').to(device)
127
 
128
  # Generation configuration for "folding" (AA-->3Di)
129
  gen_kwargs_aa2fold = {
130
+ "do_sample": True,
131
  "num_beams": 3,
132
  "top_p" : 0.95,
133
  "temperature" : 1.2,
 
147
  **gen_kwargs_aa2fold
148
  )
149
  # Decode and remove white-spaces between tokens
150
+ decoded_translations = tokenizer.batch_decode(translations, skip_special_tokens=True)
151
+ structure_sequences = ["".join(ts.split(" ")) for ts in decoded_translations] # predicted 3Di strings
152
 
153
  # Now we can use the same model and invert the translation logic
154
  # to generate an amino acid sequence from the predicted 3Di-sequence (3Di-->AA)
155
 
156
  # add pre-fixes accordingly. For the translation from 3Di to AA (3Di-->AA), you need to prepend "<fold2AA>"
157
+ sequence_examples_backtranslation = ["<fold2AA>" + " " + s for s in decoded_translations]
158
 
159
  # tokenize sequences and pad up to the longest sequence in the batch
160
+ ids_backtranslation = tokenizer(sequence_examples_backtranslation,
161
  add_special_tokens=True,
162
  padding="longest",
163
+ return_tensors='pt').to(device)
164
 
165
  # Example generation configuration for "inverse folding" (3Di-->AA)
166
  gen_kwargs_fold2AA = {
 
183
  **gen_kwargs_fold2AA
184
  )
185
  # Decode and remove white-spaces between tokens
186
+ decoded_backtranslations = tokenizer.batch_decode(backtranslations, skip_special_tokens=True)
187
+ aminoAcid_sequences = ["".join(ts.split(" ")) for ts in decoded_backtranslations] # predicted amino acid strings
 
188
  ```
189
 
190