# app.py — text-classification Space by sjsoares (revision 290d189)
import os
from pathlib import Path
from typing import Dict, List, Optional
import numpy as np
import pandas as pd
import joblib
from fastapi import FastAPI
from pydantic import BaseModel
from huggingface_hub import snapshot_download
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# Hub repository expected to contain either a serialized sklearn pipeline
# (.joblib/.pkl) or a training CSV with 'text' and 'label' columns.
MODEL_ID = "sjsoares/6100.Model.Soares"
# Local directory where the repo snapshot is downloaded and artifacts cached.
LOCAL_DIR = Path("model_artifacts")
# File suffixes treated as pre-trained scikit-learn artifacts.
JOBLIB_EXTS = (".joblib", ".pkl")
CACHED_MODEL = LOCAL_DIR / "model.joblib" # where we will save a trained pipeline if needed
app = FastAPI(
    title="EAI6010 — Week 5 (Text Classification)",
    description="Robust loader: use joblib if present; else train from repo CSV at runtime.",
    version="1.0.0",
)
class PredictRequest(BaseModel):
    """Request body for POST /predict: a single raw text to classify."""
    text: str
def download_repo():
    """Download a snapshot of the model repo from the Hub into LOCAL_DIR."""
    # Token is only needed for private repos; set it as a Space secret.
    hub_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    LOCAL_DIR.mkdir(parents=True, exist_ok=True)
    snapshot_download(
        repo_id=MODEL_ID,
        repo_type="model",
        local_dir=str(LOCAL_DIR),
        token=hub_token,
    )
def list_downloaded(base_dir: Optional[Path] = None) -> List[str]:
    """Return sorted relative paths of every regular file under *base_dir*.

    Args:
        base_dir: Directory to scan. Defaults to None, which means the
            downloaded repo dir (LOCAL_DIR) — existing callers are unaffected.

    Returns:
        Sorted list of path strings relative to the scanned directory.
    """
    # Resolve the default lazily so importing this function does not require
    # LOCAL_DIR to exist yet.
    root = LOCAL_DIR if base_dir is None else Path(base_dir)
    return sorted(
        str(p.relative_to(root)) for p in root.rglob("*") if p.is_file()
    )
def find_joblib_artifact(
    base_dir: Optional[Path] = None,
    suffixes: Optional[tuple] = None,
) -> Optional[Path]:
    """Return the first joblib/pickle artifact found under *base_dir*, or None.

    Args:
        base_dir: Directory to search. Defaults to LOCAL_DIR (backward
            compatible with no-argument callers).
        suffixes: File suffixes considered artifacts. Defaults to JOBLIB_EXTS.

    Returns:
        Path of the first matching file, or None when nothing matches.
    """
    root = LOCAL_DIR if base_dir is None else Path(base_dir)
    exts = JOBLIB_EXTS if suffixes is None else tuple(suffixes)
    # next() with a default stops at the first hit without walking everything.
    return next(
        (p for p in root.rglob("*") if p.is_file() and p.suffix in exts),
        None,
    )
def load_csv_for_training(base_dir: Optional[Path] = None) -> pd.DataFrame:
    """Locate and load a training CSV from known locations in the repo.

    Tries a fixed list of candidate paths under *base_dir* and returns the
    first CSV that has both 'text' and 'label' columns, with rows missing
    either value dropped.

    Args:
        base_dir: Directory to search. Defaults to LOCAL_DIR (backward
            compatible with no-argument callers).

    Returns:
        DataFrame with at least 'text' and 'label' columns, NaNs dropped.

    Raises:
        FileNotFoundError: If no candidate CSV with the required columns exists.
    """
    root = LOCAL_DIR if base_dir is None else Path(base_dir)
    candidates = [
        root / "data" / "text_data.csv",
        root / "fallback_text_data.csv",
        root / "data" / "fallback_text_data.csv",
    ]
    for c in candidates:
        # Guard clause keeps the happy path flat.
        if not c.exists():
            continue
        df = pd.read_csv(c)
        if {"text", "label"}.issubset(df.columns):
            print(f"[startup] Found training CSV: {c}")
            return df.dropna(subset=["text", "label"])
        print(f"[startup] CSV {c} missing required columns 'text','label'")
    raise FileNotFoundError(
        f"No suitable CSV found. Looked for: {', '.join(str(c) for c in candidates)} "
        "with columns 'text','label'."
    )
def train_runtime_model(df: pd.DataFrame) -> Pipeline:
    """Fit a small TF-IDF + logistic-regression pipeline on *df* and cache it.

    Expects *df* to have 'text' and 'label' columns. The fitted pipeline is
    persisted to CACHED_MODEL so subsequent startups can skip retraining.
    """
    print("[startup] Training lightweight TF-IDF + LogisticRegression pipeline at runtime...")
    texts = df["text"].astype(str).values
    labels = df["label"].astype(str).values
    model = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=20000)),
        ("clf", LogisticRegression(max_iter=1000)),
    ])
    model.fit(texts, labels)
    # Cache the fitted pipeline for future startups.
    LOCAL_DIR.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, CACHED_MODEL)
    print(f"[startup] Saved trained pipeline to {CACHED_MODEL}")
    return model
def init_model() -> Pipeline:
    """Resolve a usable pipeline: local cache → repo artifact → runtime training."""
    # Fast path: a pipeline trained on a previous startup was cached locally.
    if CACHED_MODEL.exists():
        print(f"[startup] Loading cached pipeline: {CACHED_MODEL}")
        return joblib.load(CACHED_MODEL)

    # Pull the repo snapshot and report exactly what came down.
    download_repo()
    files = list_downloaded()
    print(f"[startup] Downloaded files from {MODEL_ID}:\n" + "\n".join(files))

    # Prefer a ready-made scikit-learn artifact shipped in the repo.
    artifact = find_joblib_artifact()
    if artifact:
        print(f"[startup] Loading scikit-learn artifact: {artifact}")
        return joblib.load(artifact)

    # Last resort: train from a CSV bundled with the repo (and cache it).
    return train_runtime_model(load_csv_for_training())
# Load model at startup: runs at import time, so any loading/training failure
# aborts app startup instead of surfacing on the first request.
PIPE = init_model()
@app.get("/")
def health():
    """Health/info endpoint: reports which model backing is currently in use."""
    artifact = str(CACHED_MODEL) if CACHED_MODEL.exists() else "runtime"
    return {
        "service": "EAI6010 Text Classifier",
        "model_id": MODEL_ID,
        "artifact": artifact,
        "status": "ok",
    }
@app.post("/predict")
def predict(req: PredictRequest):
    """Classify *req.text* and return the top label plus per-class scores.

    Returns:
        dict with the echoed input, the argmax label, and a mapping from
        every class label to its predicted probability.
    """
    proba = PIPE.predict_proba([req.text])[0]
    labels = PIPE.classes_
    # zip over (label, probability) pairs avoids index arithmetic.
    scores: Dict[str, float] = {
        str(lbl): float(p) for lbl, p in zip(labels, proba)
    }
    top = labels[int(np.argmax(proba))]
    return {"input": req.text, "label": str(top), "scores": scores}