# File size: 1,307 Bytes
#
# 295b1cd

import sys
import os
sys.path.append('/scratch/pranamlab/tong/ReDi_discrete/smiles')
import xgboost as xgb
import torch
import numpy as np
from transformers import AutoModelForMaskedLM
from smiles_tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
import numpy as np
from rdkit import Chem, rdBase, DataStructs

class Analyzer:
    """Binary scorer flagging SMILES strings that contain an amide-type bond.

    A sequence scores 1 when RDKit can parse it and it matches at least one
    of the two SMARTS queries below; it scores 0 when it matches neither or
    when RDKit cannot parse it at all.
    """

    # Aliphatic nitrogen carrying one hydrogen, bonded to a carbonyl carbon
    # (the "NC(=O)" peptide-bond motif).
    _PEPTIDE_BOND_SMARTS = '[NH][C](=O)'

    # Hydrogen-free aliphatic nitrogen with a carbon substituent, bonded to a
    # carbonyl carbon.  Intended for N-methylated peptide bonds; note that the
    # pattern only requires an aliphatic carbon substituent, so it is not
    # limited to methyl groups.
    _N_METHYL_SMARTS = '[N;H0;$(NC)](C)[C](=O)'

    # Compiled query molecules.  Built on first use (not at import or
    # construction time) and then reused, because the queries never change.
    _compiled_patterns = None

    def __init__(self, device):
        """Store ``device``; it is kept as an attribute and not used for scoring."""
        self.device = device

    @classmethod
    def _patterns(cls):
        """Return the compiled SMARTS queries, compiling them once on first call."""
        if cls._compiled_patterns is None:
            cls._compiled_patterns = tuple(
                Chem.MolFromSmarts(smarts)
                for smarts in (cls._PEPTIDE_BOND_SMARTS, cls._N_METHYL_SMARTS)
            )
        return cls._compiled_patterns

    def _score_one(self, smiles):
        """Return 1 if ``smiles`` parses and matches either query, else 0."""
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            return 0
        return int(any(mol.HasSubstructMatch(query) for query in self._patterns()))

    def get_scores(self, input_seqs):
        """Check whether each SMILES represents a peptide-like structure.

        Args:
            input_seqs: Iterable of SMILES strings.

        Returns:
            A list with one int per input, in input order: 1 if the molecule
            contains a matching amide bond, 0 otherwise (including SMILES
            that RDKit fails to parse).
        """
        return [self._score_one(smiles) for smiles in input_seqs]

    def __call__(self, input_seqs):
        """Score ``input_seqs`` and return the 0/1 scores as an int64 tensor.

        The dtype is fixed explicitly: with dtype inference an empty batch
        would come back as float32 while non-empty batches are int64.
        The tensor is created on the default device; ``self.device`` is not
        applied here.
        """
        scores = self.get_scores(input_seqs)
        return torch.tensor(scores, dtype=torch.long)