Spaces:
Sleeping
Sleeping
File size: 3,228 Bytes
d50df92 7cb3770 d50df92 edc874f d50df92 7cb3770 d50df92 edc874f d50df92 7cb3770 edc874f 7cb3770 edc874f 7cb3770 edc874f 7cb3770 edc874f 7cb3770 edc874f 7cb3770 edc874f ee2690e edc874f 7cb3770 d50df92 7cb3770 edc874f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import gradio as gr
import numpy as np
import spaces
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from rdkit import Chem
from rdkit.Chem import Draw
from PIL import Image
import io
# Module-level holders for the classifier and its tokenizer. Both stay None
# until load_model() assigns them.
model, tokenizer = None, None
def load_model(model_path):
    """Populate the module-level ``model`` and ``tokenizer`` globals.

    Both objects are created with ``from_pretrained(model_path)``: the
    sequence-classification model first, then the tokenizer. A confirmation
    line is printed once both calls have returned.
    """
    global model, tokenizer

    # Keep the original order: the model global is assigned before the
    # tokenizer is requested.
    classifier = AutoModelForSequenceClassification.from_pretrained(model_path)
    model = classifier

    text_tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer = text_tokenizer

    print(f"Model loaded from {model_path}")
def draw_molecule(smiles):
    """Render a SMILES string as a 400x400 image.

    Args:
        smiles: SMILES text. Leading and trailing whitespace is ignored.

    Returns:
        The image produced by ``Draw.MolToImage``, or ``None`` when the
        input is not a non-blank string, when ``Chem.MolFromSmiles``
        returns ``None``, or when parsing/drawing raises.
    """
    # Reject blank or non-string input before it reaches RDKit.
    # NOTE(review): RDKit appears to parse "" into an atom-less molecule,
    # which would be drawn as an empty canvas instead of "no image" - confirm.
    if not isinstance(smiles, str):
        return None
    smiles = smiles.strip()
    if not smiles:
        return None

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return Draw.MolToImage(mol, size=(400, 400))
    except Exception as e:
        # Best effort: report the failure and hand back "no image" so the
        # caller receives None instead of an exception.
        print(f"Error drawing molecule: {e}")
        return None
@spaces.GPU
def predict(input_text):
    """Classify a SMILES string as natural or unnatural and draw it.

    Args:
        input_text: SMILES string entered by the user.

    Returns:
        A ``(image, text)`` pair. ``image`` is whatever ``draw_molecule``
        returns (``None`` when no drawing is available). ``text`` reports
        the predicted type and the probability of the "Natural" class.
        Error paths return ``(None, message)`` so the shape always matches
        the two outputs declared on the Gradio interface.
    """
    # Local import: torch is needed here only for no_grad(), and the module
    # does not import it at top level.
    import torch

    if model is None or tokenizer is None:
        # Must be a pair: the interface has two output components.
        return None, "Error: Model not loaded"
    if not input_text or not input_text.strip():
        return None, "Error: Please enter a SMILES string"

    # Draw molecule structure
    mol_image = draw_molecule(input_text)

    model.to('cuda')

    # Tokenize input directly and move the tensors to the GPU
    inputs = tokenizer(
        input_text,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    inputs = {k: v.to('cuda') for k, v in inputs.items()}

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits.detach().cpu().numpy()

    # Stable softmax: subtracting the row maximum keeps exp() from overflowing
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

    # Index of the most probable class for the single input row
    pred_label = int(np.argmax(probs, axis=1)[0])
    label_map = {0: "Unnatural", 1: "Natural"}
    pred_label_text = label_map[pred_label]

    # Column 1 is the "Natural" class in label_map. With two classes this is
    # the same value as 1 - probs[0, 0], so no branch on the label is needed.
    natural_prob = float(probs[0, 1])

    result = f"Type: {pred_label_text}\n"
    result += f"Natural Product Probability: {natural_prob:.4f}\n"
    return mol_image, result
# Load the model at import time so it is available before the interface
# handles its first request.
load_model("shulik7/NP_SMILES_tokenized_PubChem_shard00_160k")

# Gradio interface: one SMILES text box in, molecule image + text report out.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=5, placeholder="Enter the SMILES here...", label="SMILES Input"),
    outputs=[
        gr.Image(label="Molecule Structure", type="pil"),
        gr.Textbox(lines=5, placeholder="Prediction results will appear here...", label="Prediction"),
    ],
    title="Naturalness Prediction",
    description="Enter SMILES string to get the prediction from the fine-tuned ChemBERTa model.",
    examples=[
        ["CN1C=NC2=C1C(=O)N(C(=O)N2C)C"],  # Caffeine - natural product
        # This SMILES carries the isopropoxyethoxymethyl side chain of
        # bisoprolol (atenolol has an acetamide group instead).
        ["CC(C)NCC(COC1=CC=C(C=C1)COCCOC(C)C)O"],  # Bisoprolol - synthetic
        ["CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C"],  # Retinoic acid - natural product
    ],
    flagging_mode="never",
)

if __name__ == "__main__":
    demo.launch()