|
|
import sys |
|
|
import os |
|
|
sys.path.append('/scratch/pranamlab/tong/ReDi_discrete/smiles') |
|
|
import xgboost as xgb |
|
|
import torch |
|
|
import numpy as np |
|
|
from transformers import AutoModelForMaskedLM |
|
|
from smiles_tokenizer.my_tokenizers import SMILES_SPE_Tokenizer |
|
|
import numpy as np |
|
|
from rdkit import Chem, rdBase, DataStructs |
|
|
|
|
|
class Analyzer:
    """Binary scorer that flags SMILES strings containing an amide bond.

    Each SMILES is parsed with RDKit and scored ``1`` when the molecule
    matches either of the two SMARTS patterns below, and ``0`` when it
    matches neither or cannot be parsed at all.
    """

    # Amide nitrogen carrying one hydrogen, bonded to a carbonyl carbon.
    PEPTIDE_BOND_SMARTS = '[NH][C](=O)'
    # Hydrogen-free nitrogen bonded to a carbon substituent and to a carbonyl
    # carbon (the original code names this the "n_methyl" pattern).
    N_METHYL_SMARTS = '[N;H0;$(NC)](C)[C](=O)'

    def __init__(self, device):
        """Store ``device`` and compile the SMARTS queries.

        Args:
            device: Kept on ``self.device``. None of the methods in this
                class read it; the tensor returned by ``__call__`` is built
                without an explicit device.
        """
        self.device = device
        # The queries never change, so compile them once here instead of
        # once per molecule inside the scoring loop.
        self._peptide_bond_pattern = Chem.MolFromSmarts(self.PEPTIDE_BOND_SMARTS)
        self._n_methyl_pattern = Chem.MolFromSmarts(self.N_METHYL_SMARTS)

    def get_scores(self, input_seqs):
        """Score each SMILES string for the presence of an amide bond.

        Args:
            input_seqs: Iterable of SMILES strings.

        Returns:
            A list with one int per input, in input order: ``1`` if the
            parsed molecule matches either SMARTS pattern, otherwise ``0``.
            SMILES that RDKit fails to parse also score ``0``.
        """
        results = []
        for smiles in input_seqs:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # RDKit returns None for SMILES it cannot parse.
                results.append(0)
                continue

            has_amide = (
                mol.HasSubstructMatch(self._peptide_bond_pattern)
                or mol.HasSubstructMatch(self._n_methyl_pattern)
            )
            results.append(1 if has_amide else 0)
        return results

    def __call__(self, input_seqs):
        """Return the scores from ``get_scores`` as a 1-D integer tensor.

        Args:
            input_seqs: Iterable of SMILES strings.

        Returns:
            ``torch.Tensor`` of dtype ``torch.long`` with one 0/1 entry per
            input.
        """
        scores = self.get_scores(input_seqs)
        # Pin the dtype: without it an empty batch would come back as
        # float32 while non-empty batches are int64.
        return torch.tensor(scores, dtype=torch.long)