File size: 2,033 Bytes
6353768
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import joblib
import numpy as np

from rdkit import Chem
from rdkit.Chem import Descriptors

from src.serving.similarity import SimilarityIndex

# Locations of the serialized model and of the cleaned dataset. Both are
# relative paths, so they depend on the directory the server is started from.
MODEL_PATH = "models/lipophilicity_rf.joblib"
DATA_PATH = "data/processed/lipophilicity_clean.csv"

# Application object that the route and startup decorators below attach to.
app = FastAPI(
    title="ChemiMLOps API",
    version="0.1.0",
)

# The model is loaded eagerly at import time: a missing or unreadable file
# makes the import fail instead of surfacing on the first request.
model = joblib.load(MODEL_PATH)

# Assigned by the startup handler; remains None until that handler has run.
SIM_INDEX: SimilarityIndex | None = None


class PredictRequest(BaseModel):
    # Request body accepted by the lipophilicity prediction endpoint.
    # `smiles` is a required string (Ellipsis default means "no default");
    # "CCO" (ethanol) is attached to the field as its example value.
    smiles: str = Field(
        default=...,
        example="CCO",
    )


def rdkit_features(smiles: str) -> np.ndarray | None:
    """Compute the RDKit descriptor row for a single molecule.

    Args:
        smiles: SMILES string of the molecule to featurize.

    Returns:
        A float array of shape (1, 7) holding, in order: molecular weight,
        MolLogP, TPSA, H-bond donor count, H-bond acceptor count,
        rotatable-bond count and ring count. ``None`` when the SMILES cannot
        be parsed or describes a molecule without any atoms.
    """
    mol = Chem.MolFromSmiles(smiles)
    # RDKit returns None for unparsable input, but an empty SMILES parses
    # into a valid molecule with zero atoms. Featurizing that would yield an
    # all-zero row and a meaningless prediction, so it is rejected as well.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # NOTE(review): the descriptor order presumably has to match the column
    # order the model was trained on -- confirm against the training code
    # before reordering or extending this list.
    feats = [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.TPSA(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.RingCount(mol),
    ]
    # Shape (1, n_features): a single-sample matrix.
    return np.array(feats, dtype=float).reshape(1, -1)


@app.on_event("startup")
def load_similarity_index():
    """Build the similarity index from DATA_PATH when the API starts.

    The constructed index is stored in the module-level SIM_INDEX, which is
    None until this handler has run.
    """
    global SIM_INDEX
    index = SimilarityIndex(DATA_PATH)
    SIM_INDEX = index


@app.get("/health")
def health():
    # Liveness probe: always answers with a constant payload and touches
    # neither the model nor the similarity index.
    status_payload = {"status": "ok"}
    return status_payload


@app.post("/predict/lipophilicity")
def predict(req: PredictRequest):
    # Featurize the submitted SMILES, run the loaded model on the resulting
    # single-row matrix and return the first prediction as a plain float.
    # Answers HTTP 400 when no feature row could be computed for the input.
    smiles = req.smiles
    features = rdkit_features(smiles)

    if features is not None:
        predictions = model.predict(features)
        # float() turns the model's output element into a built-in float
        # before it is placed in the response body.
        return {
            "smiles": smiles,
            "lipophilicity_pred": float(predictions[0]),
        }

    raise HTTPException(
        status_code=400,
        detail="Invalid SMILES. Example: 'CCO' or 'c1ccccc1'",
    )


@app.get("/similarity/topk")
def similarity_topk(smiles: str, k: int = 5):
    # Return the k entries of the similarity index that best match `smiles`.
    #
    # Query parameters:
    #   smiles: query molecule as a SMILES string.
    #   k: number of neighbours to return; must be >= 1 (default 5).
    #
    # Error responses:
    #   400 - k is not positive, or the index reports the SMILES as invalid
    #         (signalled by SIM_INDEX.topk returning None).
    #   503 - the startup handler has not populated SIM_INDEX yet.

    # `k` comes straight from the query string; reject zero and negative
    # values here instead of passing them on to the index.
    if k < 1:
        raise HTTPException(status_code=400, detail="k must be a positive integer")

    if SIM_INDEX is None:
        raise HTTPException(status_code=503, detail="Similarity index not loaded yet")

    results = SIM_INDEX.topk(smiles, k=k)
    if results is None:
        raise HTTPException(status_code=400, detail="Invalid SMILES for similarity search")

    return {"query": smiles, "k": k, "results": results}