import sys
import os

# Must run before the `smiles_tokenizer` import below, which presumably
# resolves through this directory -- TODO confirm and replace the hard-coded
# path with proper packaging.
sys.path.append('/scratch/pranamlab/tong/ReDi_discrete/smiles')

import xgboost as xgb
import torch
import numpy as np
from transformers import AutoModelForMaskedLM
from smiles_tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
from rdkit import Chem, rdBase, DataStructs


class Analyzer:
    """Binary scorer flagging SMILES strings that contain an amide-like motif.

    A SMILES string scores 1 when RDKit can parse it and the molecule matches
    at least one of the two SMARTS patterns below; otherwise it scores 0.
    """

    # Compiled once at class-definition time: the patterns are constant, so
    # rebuilding them for every input molecule is wasted work.
    # Peptide bond: N carrying one hydrogen, bonded to a carbonyl carbon.
    _PEPTIDE_BOND_PATTERN = Chem.MolFromSmarts('[NH][C](=O)')
    # N-substituted (e.g. N-methylated) peptide bond: hydrogen-free N with a
    # carbon substituent, bonded to a carbonyl carbon.
    _N_METHYL_PATTERN = Chem.MolFromSmarts('[N;H0;$(NC)](C)[C](=O)')

    def __init__(self, device):
        """Store ``device``.

        Nothing visible in this class reads ``self.device``; it is kept so the
        constructor signature and attribute stay available to callers.
        """
        self.device = device

    def get_scores(self, input_seqs):
        """Check whether each SMILES string represents a peptide structure.

        Args:
            input_seqs: Iterable of SMILES strings.

        Returns:
            A list with one int per input, in input order: 1 if the parsed
            molecule contains a peptide bond or an N-substituted peptide
            bond, 0 if it contains neither or the SMILES could not be parsed.
        """
        results = []
        for smiles in input_seqs:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # Unparseable SMILES are scored as non-peptides, not errors.
                results.append(0)
                continue
            is_peptide = (
                mol.HasSubstructMatch(self._PEPTIDE_BOND_PATTERN)
                or mol.HasSubstructMatch(self._N_METHYL_PATTERN)
            )
            results.append(1 if is_peptide else 0)
        return results

    def __call__(self, input_seqs):
        """Return ``get_scores(input_seqs)`` wrapped in a ``torch.Tensor``.

        NOTE(review): the tensor is built with ``torch.tensor`` defaults and
        is not moved to ``self.device`` -- confirm whether callers expect
        that.
        """
        scores = self.get_scores(input_seqs)
        return torch.tensor(scores)