File size: 3,228 Bytes
d50df92
7cb3770
d50df92
edc874f
 
 
 
 
d50df92
7cb3770
 
 
 
 
 
 
 
 
 
d50df92
edc874f
 
 
 
 
 
 
 
 
 
 
 
d50df92
7cb3770
 
 
 
 
edc874f
 
 
7cb3770
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
edc874f
7cb3770
 
 
edc874f
 
 
7cb3770
 
 
 
edc874f
7cb3770
edc874f
 
 
 
 
7cb3770
edc874f
ee2690e
 
 
 
 
edc874f
7cb3770
d50df92
7cb3770
edc874f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import gradio as gr
import numpy as np
import spaces
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from rdkit import Chem
from rdkit.Chem import Draw
from PIL import Image
import io

# Module-level handles for the classifier and its tokenizer.
# Both stay None until load_model() assigns them.
model = tokenizer = None

def load_model(model_path):
    """Populate the module-level ``model`` and ``tokenizer`` from ``model_path``.

    ``model_path`` is forwarded unchanged to the ``from_pretrained`` methods of
    ``AutoModelForSequenceClassification`` and ``AutoTokenizer``. A confirmation
    line is printed once both have been loaded.
    """
    global model, tokenizer

    # The model is assigned before the tokenizer is loaded, so a failure in
    # the second call still leaves the first global set.
    classifier = AutoModelForSequenceClassification.from_pretrained(model_path)
    model = classifier

    loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer = loaded_tokenizer

    print(f"Model loaded from {model_path}")

def draw_molecule(smiles):
    """Render the molecule described by ``smiles`` as a 400x400 image.

    Returns the image produced by ``Draw.MolToImage``, or ``None`` when
    ``Chem.MolFromSmiles`` yields no molecule or when any exception is raised
    while parsing or drawing (the error is printed, never propagated).
    """
    try:
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is not None:
            return Draw.MolToImage(parsed, size=(400, 400))
    except Exception as err:
        # Best-effort rendering: report the problem and fall through to None.
        print(f"Error drawing molecule: {err}")
    return None

@spaces.GPU
def predict(input_text):
    """Predict whether a SMILES string describes a natural product.

    Args:
        input_text: SMILES string typed into the interface. Leading and
            trailing whitespace is ignored.

    Returns:
        A ``(image, text)`` tuple, one value per Gradio output component.
        ``image`` is the result of ``draw_molecule`` (``None`` when the
        structure cannot be drawn); ``text`` holds the predicted type and the
        natural-product probability. When the model is not loaded or the
        input is blank, ``image`` is ``None`` and ``text`` is an error message.
    """
    # Local import: torch is only needed inside this function.
    import torch

    if model is None or tokenizer is None:
        # Return one value per output component; a lone string would not
        # match the two outputs declared on the interface.
        return None, "Error: Model not loaded"

    # The multi-line textbox can deliver surrounding whitespace/newlines,
    # which are not part of the SMILES and would otherwise be tokenized.
    smiles = input_text.strip() if input_text else ""
    if not smiles:
        return None, "Error: Please enter a SMILES string"

    # Draw molecule structure
    mol_image = draw_molecule(smiles)

    # Prefer the GPU, but keep the function usable on CPU-only machines.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Tokenize input directly and move the tensors next to the model
    inputs = tokenizer(
        smiles,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits.detach().cpu().numpy()

    # Stable softmax to get probabilities
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

    # Map the most probable class index to its label
    pred_label = int(np.argmax(probs, axis=1)[0])
    label_map = {0: "Unnatural", 1: "Natural"}
    pred_label_text = label_map[pred_label]

    # With two classes, 1 - P(class 0) equals P(class 1), so the probability
    # of the "Natural" class is read directly regardless of the prediction.
    natural_prob = float(probs[0][1])

    # Format output
    result = f"Type: {pred_label_text}\n"
    result += f"Natural Product Probability: {natural_prob:.4f}\n"

    return mol_image, result



# Load model on initialization.
# This runs at import time: load_model() stores the model and tokenizer in the
# module-level globals that predict() reads.
load_model("shulik7/NP_SMILES_tokenized_PubChem_shard00_160k")  
# Create Gradio interface.
# predict()'s normal return value is (mol_image, result); the two values are
# mapped in order onto the Image and Textbox output components below.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=5, placeholder="Enter the SMILES here...", label="SMILES Input"),
    outputs=[
        gr.Image(label="Molecule Structure", type="pil"),
        gr.Textbox(lines=5, placeholder="Prediction results will appear here...", label="Prediction")
    ],
    title="Naturalness Prediction",
    description="Enter SMILES string to get the prediction from the fine-tuned ChemBERTa model.",
    examples=[
        ["CN1C=NC2=C1C(=O)N(C(=O)N2C)C"],  # Caffeine - natural product
        # NOTE(review): previously labelled atenolol, but the para substituent
        # COCCOC(C)C (isopropoxyethoxymethyl) makes this bisoprolol.
        ["CC(C)NCC(COC1=CC=C(C=C1)COCCOC(C)C)O"],  # Bisoprolol - synthetic
        ["CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C"]  # Retinoic acid - natural product
    ],
    flagging_mode="never"
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()