# Tong Chen
# add files
# 295b1cd
import sys
import os
sys.path.append('/scratch/pranamlab/tong/ReDi_discrete/smiles')
import xgboost as xgb
import torch
import numpy as np
from transformers import AutoModelForMaskedLM
from smiles_tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
import numpy as np
from rdkit import Chem, rdBase, DataStructs
class Analyzer:
    """Binary scorer that flags SMILES strings containing a peptide bond.

    A molecule scores 1 when it parses and matches at least one of the
    SMARTS queries in ``_PEPTIDE_SMARTS``; it scores 0 otherwise, including
    when RDKit cannot parse the SMILES string.
    """

    # SMARTS queries, tried in order; one match is enough for a score of 1.
    _PEPTIDE_SMARTS = (
        '[NH][C](=O)',  # peptide bonds: NC(=O) pattern
        '[N;H0;$(NC)](C)[C](=O)',  # N-methylated peptide bonds: N(C)C(=O) pattern
    )
    # Compiled query molecules. Built lazily on first use (so constructing an
    # Analyzer does no RDKit work) and then shared, instead of being
    # recompiled for every input molecule.
    _compiled_patterns = None

    def __init__(self, device):
        """Store ``device``.

        NOTE(review): ``device`` is only stored; nothing in this class reads
        it, and ``__call__`` does not move its result to it.
        """
        self.device = device

    @classmethod
    def _get_patterns(cls):
        """Return the compiled SMARTS queries, compiling them on first use."""
        if cls._compiled_patterns is None:
            cls._compiled_patterns = tuple(
                Chem.MolFromSmarts(smarts) for smarts in cls._PEPTIDE_SMARTS
            )
        return cls._compiled_patterns

    def get_scores(self, input_seqs):
        """Check if each SMILES represents a peptide structure.

        Args:
            input_seqs: Iterable of SMILES strings.

        Returns:
            A list with one int per input, in input order: 1 if the molecule
            matches a peptide-bond query, 0 if it does not or if the SMILES
            could not be parsed.
        """
        results = []
        for smiles in input_seqs:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # Unparseable SMILES are scored 0 rather than raising.
                results.append(0)
                continue
            is_peptide = any(
                mol.HasSubstructMatch(pattern)
                for pattern in self._get_patterns()
            )
            results.append(1 if is_peptide else 0)
        return results

    def __call__(self, input_seqs):
        """Score ``input_seqs`` and return the 0/1 scores as a 1-D tensor.

        The dtype is pinned to int64 so that an empty batch yields the same
        dtype as a non-empty one (an empty list would otherwise be inferred
        as float32). The tensor is created on torch's default device.
        """
        scores = self.get_scores(input_seqs)
        return torch.tensor(scores, dtype=torch.long)