File size: 1,307 Bytes
295b1cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
import sys
import os
sys.path.append('/scratch/pranamlab/tong/ReDi_discrete/smiles')
import xgboost as xgb
import torch
import numpy as np
from transformers import AutoModelForMaskedLM
from smiles_tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
import numpy as np
from rdkit import Chem, rdBase, DataStructs
class Analyzer:
    """Binary scorer that flags SMILES strings containing an amide-bond motif.

    Each input SMILES is parsed with RDKit and tested against two SMARTS
    substructure patterns; the score is 1 when either pattern matches and
    0 otherwise (including when the SMILES cannot be parsed).
    """

    def __init__(self, device):
        """Store *device* on the instance.

        NOTE(review): nothing in this class reads ``self.device``; the
        tensor returned by ``__call__`` is created on torch's default
        device. Kept as-is so existing callers see the same behavior.
        """
        self.device = device

    def get_scores(self, input_seqs):
        """Return a 0/1 score per SMILES string in *input_seqs*.

        Args:
            input_seqs: iterable of SMILES strings.

        Returns:
            A list of ints, one per input in order: 1 if the molecule
            matches at least one of the two amide patterns below, 0 if it
            matches neither or if ``Chem.MolFromSmiles`` returns ``None``
            for the string.
        """
        # The patterns do not depend on the molecule being scored, so they
        # are compiled once per call instead of once per input string.
        # Secondary amide: N carrying exactly one hydrogen, bonded to a
        # carbon that has a double-bonded oxygen.
        peptide_bond_pattern = Chem.MolFromSmarts('[NH][C](=O)')
        # Hydrogen-free N bonded to a carbon substituent and to a carbonyl
        # carbon; intended to catch N-methylated peptide bonds.
        n_methyl_pattern = Chem.MolFromSmarts('[N;H0;$(NC)](C)[C](=O)')

        results = []
        for smiles in input_seqs:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # Unparseable SMILES are scored as non-peptide.
                results.append(0)
                continue
            has_amide = (
                mol.HasSubstructMatch(peptide_bond_pattern)
                or mol.HasSubstructMatch(n_methyl_pattern)
            )
            results.append(1 if has_amide else 0)
        return results

    def __call__(self, input_seqs):
        """Score *input_seqs* and return the scores as a 1-D integer tensor.

        The dtype is pinned to ``torch.long`` so that an empty batch yields
        an int64 tensor like every non-empty batch does; without it,
        ``torch.tensor([])`` would fall back to the default float dtype.
        """
        scores = self.get_scores(input_seqs)
        return torch.tensor(scores, dtype=torch.long)