from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn

# pSMILES repeat units for common polymers; the two [*] wildcards mark the
# attachment points of each repeat unit along the polymer backbone.
polymer2psmiles = {
    'PHB': '[*]OC(C)CC(=O)[*]',  
    'PCL': '[*]OCCCCCC(=O)[*]', 
    'PVA': '[*]C(O)C[*]',      
    'PPL': '[*]CCC(=O)O[*]',  
    'P3HP': '[*]OCCC(=O)[*]', 
    'P4HB': '[*]C(=O)CCCO[*]',
    'PEA': '[*]OCCOC(=O)CCCCC(=O)[*]', 
    'PES': '[*]OCCOC(=O)CCC(=O)[*]',  
    'O-PVA': '[*]C(=O)C[*]', 
    'PBS': '[*]C(=O)CCC(=O)OCCCCO[*]', 
    'PLA': '[*]C(C)C(=O)O[*]', 
    'PEG': '[*]CCO[*]', 
    'PBSA': '[*]C(=O)CCC(=O)OCCCCOC(=O)CCC(=O)[*]', 
    'PET': '[*]CCOC(=O)c1ccc(C(=O)O[*])cc1', 
    'PE': '[*]CC[*]',
    'PMCL': '[*]C(=O)CCC(C)CCO[*]', 
    'PEF': '[*]OC(=O)c1oc(C(=O)OCC[*])cc1', 
    'PS': '[*]C(c1ccccc1)C[*]', 
    'NR': '[*]CC(C)=CC[*]', 
    'PHV': '[*]OC(CC)CC(=O)[*]',
}
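
# Optional sanity check, sketched under the assumption that RDKit is installed
# (it is not needed by the encoder itself): every pSMILES string should parse,
# since RDKit accepts [*] wildcard atoms in SMILES.
def check_psmiles(mapping=polymer2psmiles):
    from rdkit import Chem
    bad = [name for name, s in mapping.items() if Chem.MolFromSmiles(s) is None]
    assert not bad, f'Invalid pSMILES entries: {bad}'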


class PolyEncoder(nn.Module):
    """Frozen polyBERT encoder that maps pSMILES strings to token embeddings."""

    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained('kuelumbus/polyBERT')
        self.polyBERT = AutoModel.from_pretrained('kuelumbus/polyBERT')

    def forward(self, psmiles_strings):
        assert len(psmiles_strings) == 1, "Batch size must be 1 for PolyEncoder"
        encoded_input = self.tokenizer(
            psmiles_strings, padding=True, truncation=True, return_tensors='pt')
        # polyBERT is used as a frozen feature extractor, so no gradients are needed.
        with torch.no_grad():
            model_output = self.polyBERT(**encoded_input)
        # Return the last hidden state: shape (1, seq_len, hidden_dim).
        return model_output[0]
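

# Minimal usage sketch (assumption: weights are fetched from the Hugging Face
# Hub on first use). The encoder returns per-token embeddings; mean pooling
# them with the attention mask, as on the polyBERT model card, yields a
# fixed-size fingerprint vector per polymer.
if __name__ == '__main__':
    encoder = PolyEncoder()
    psmiles = [polymer2psmiles['PLA']]   # batch of one, per the assert above
    token_embeddings = encoder(psmiles)  # (1, seq_len, hidden_dim)

    encoded = encoder.tokenizer(
        psmiles, padding=True, truncation=True, return_tensors='pt')
    mask = encoded['attention_mask'].unsqueeze(-1).float()  # (1, seq_len, 1)
    fingerprint = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
    print(fingerprint.shape)             # (1, hidden_dim)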