Update README.md
Browse files
README.md
CHANGED
This is a RoBERTa style masked language model trained on ~480m SMILES strings from the ZINC database.
The model has ~102m parameters and was trained for 150000 iterations with a batch size of 4096 to a validation loss of ~0.122.
This model is useful for generating embeddings from SMILES strings.
---
license: mit
---

The model has ~102m parameters and was trained for 150000 iterations with a batch size of 4096 to a validation loss of ~0.122.

This model is useful for generating embeddings from SMILES strings.
Example usage for computing mean-pooled embeddings from SMILES strings:

```python
from transformers import RobertaTokenizerFast, RobertaForMaskedLM, DataCollatorWithPadding

tokenizer = RobertaTokenizerFast.from_pretrained("entropy/roberta_zinc_480m", max_len=128)
model = RobertaForMaskedLM.from_pretrained('entropy/roberta_zinc_480m')
collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors='pt')

smiles = ['Brc1cc2c(NCc3ccccc3)ncnc2s1',
          'Brc1cc2c(NCc3ccccn3)ncnc2s1',
          'Brc1cc2c(NCc3cccs3)ncnc2s1',
          'Brc1cc2c(NCc3ccncc3)ncnc2s1',
          'Brc1cc2c(Nc3ccccc3)ncnc2s1']

inputs = collator(tokenizer(smiles))
outputs = model(**inputs, output_hidden_states=True)
full_embeddings = outputs[1][-1]
mask = inputs['attention_mask']
embeddings = ((full_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1).unsqueeze(-1))
```

---
license: mit
---