In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import selfies as sf
from rdkit import Chem
from typing import Optional
import numpy as np
import py3Dmol
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import torch

def smiles_to_3d(smiles_list, width=400, height=300):
    # Visualize the 3D structure using py3Dmol
    view = py3Dmol.view(width=width, height=height)
    for smiles in smiles_list:
        # Generate the RDKit molecule object
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError("Invalid SMILES string")

        # Add hydrogens to the molecule
        mol = Chem.AddHs(mol)

        # Generate 3D coordinates
        AllChem.EmbedMolecule(mol, randomSeed=42)
        AllChem.UFFOptimizeMolecule(mol)

        # Generate the 3D structure in the form of a pdb string
        pdb = Chem.MolToPDBBlock(mol)
        view.addModel(pdb, 'pdb')
        view.setStyle({'stick': {}})
        view.zoomTo()
    return view

    
# Load the checkpoint and the tokenizer
checkpoint_path = "lamthuy/SelfiesGen"
model = AutoModelForCausalLM.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

In [9]:
# Given a SMILES, get its fingerpint
smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"
smiles_to_3d([smiles])

<py3Dmol.view at 0x2afd09c40>

In [12]:
s = sf.encoder(smiles)
s = s + "[SEP]"
print(s)
input_ids = tokenizer.encode(s, return_tensors="pt")
n = input_ids.size(1)
# Generate output sequence
output_ids = model.generate(input_ids, max_length=128, num_beams=5, num_return_sequences=5,
                            early_stopping=True)
output = tokenizer.decode(output_ids[1][n:], skip_special_tokens=True)
print(output)
smiles = sf.decoder(output)
print(smiles)
smiles_to_3d([smiles])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[C][C][=Branch1][C][=O][O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=Branch1][C][=S][O][SEP]
[C][C][=Branch1][C][=O][O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=Branch1][C][=NH2+1][O]
CC(=O)OC1=CC=CC=C1C(=[NH2+1])O


<py3Dmol.view at 0x2afbaf020>

In [13]:
input_ids[0][5] = tokenizer.mask_token_id
input_ids[0][9] = tokenizer.mask_token_id
input_ids[0][18] = tokenizer.mask_token_id
input_ids[0][11] = tokenizer.mask_token_id
# Generate output sequence
output_ids = model.generate(input_ids, max_length=128, num_beams=5, num_return_sequences=5,
                            early_stopping=True)
output = tokenizer.decode(output_ids[1][n:], skip_special_tokens=True)
print(output)
smiles = sf.decoder(output)
print(smiles)
smiles_to_3d([smiles])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[C][C][=Branch1][C][=O][N][C][=C][C][=N][C][=C][Ring1][=Branch1][C][=Branch1][C][=S][O-1]
CC(=O)NC1=CC=NC=C1C(=S)[O-1]


<py3Dmol.view at 0x11fd5a780>