"""Polymer encoder: maps PSMILES repeat-unit strings to polyBERT embeddings."""

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

# Mapping from common polymer abbreviations to their PSMILES repeat-unit
# strings ([*] marks the polymerization attachment points).
polymer2psmiles = {
    'PHB': '[*]OC(C)CC(=O)[*]',
    'PCL': '[*]OCCCCCC(=O)[*]',
    'PVA': '[*]C(O)C[*]',
    'PPL': '[*]CCC(=O)O[*]',
    'P3HP': '[*]OCCC(=O)[*]',
    'P4HB': '[*]C(=O)CCCO[*]',
    'PEA': '[*]OCCOC(=O)CCCCC(=O)[*]',
    'PES': '[*]OCCOC(=O)CCC(=O)[*]',
    'O-PVA': '[*]C(=O)C[*]',
    'PBS': '[*]C(=O)CCC(=O)OCCCCO[*]',
    'PLA': '[*]C(C)C(=O)O[*]',
    'PEG': '[*]CCO[*]',
    'PBSA': '[*]C(=O)CCC(=O)OCCCCOC(=O)CCC(=O)[*]',
    'PET': '[*]CCOC(=O)c1ccc(C(=O)O[*])cc1',
    'PE': '[*]CC[*]',
    'PMCL': '[*]C(=O)CCC(C)CCO[*]',
    'PEF': '[*]OC(=O)c1oc(C(=O)OCC[*])cc1',
    'PS': '[*]C(c1ccccc1)C[*]',
    'NR': '[*]CC(C)=CC[*]',
    'PHV': '[*]OC(CC)CC(=O)[*]',
}


class PolyEncoder(nn.Module):
    """Frozen polyBERT wrapper that encodes a single PSMILES string.

    Loads the pretrained ``kuelumbus/polyBERT`` tokenizer and model from the
    Hugging Face hub. The model is used purely for inference: it is switched
    to eval mode and the forward pass runs under ``torch.no_grad()``.
    """

    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained('kuelumbus/polyBERT')
        self.polyBERT = AutoModel.from_pretrained('kuelumbus/polyBERT')
        # Inference-only: disable dropout/batch-norm training behavior so
        # repeated calls with the same input are deterministic.
        self.polyBERT.eval()

    def forward(self, psmiles_strings):
        """Encode one PSMILES string with polyBERT.

        Args:
            psmiles_strings: A list containing exactly one PSMILES string.

        Returns:
            The model's first output (token-level last hidden state,
            shape ``(1, seq_len, hidden_dim)``).

        Raises:
            ValueError: If ``psmiles_strings`` does not contain exactly one
                string. (A plain ``assert`` would be stripped under ``-O``.)
        """
        if len(psmiles_strings) != 1:
            raise ValueError("Batch size must be 1 for PolyEncoder")
        encoded_input = self.tokenizer(
            psmiles_strings, padding=True, truncation=True, return_tensors='pt')
        # No gradients needed: the encoder is frozen at inference time.
        with torch.no_grad():
            model_output = self.polyBERT(**encoded_input)
        # model_output[0] is the last hidden state of the transformer.
        return model_output[0]