Spaces:
Sleeping
Sleeping
| import os | |
| from pathlib import Path | |
| from typing import Dict, List, Optional | |
| import numpy as np | |
| import pandas as pd | |
| import joblib | |
| from fastapi import FastAPI | |
| from pydantic import BaseModel | |
| from huggingface_hub import snapshot_download | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.linear_model import LogisticRegression | |
# Hugging Face Hub repo that holds the model artifacts and/or training CSV.
MODEL_ID = "sjsoares/6100.Model.Soares"
# Local directory the Hub snapshot is downloaded into.
LOCAL_DIR = Path("model_artifacts")
# File extensions treated as serialized scikit-learn artifacts.
JOBLIB_EXTS = (".joblib", ".pkl")
CACHED_MODEL = LOCAL_DIR / "model.joblib"  # where we will save a trained pipeline if needed
# FastAPI application; this metadata appears in the auto-generated OpenAPI docs.
app = FastAPI(
    title="EAI6010 — Week 5 (Text Classification)",
    description="Robust loader: use joblib if present; else train from repo CSV at runtime.",
    version="1.0.0",
)
class PredictRequest(BaseModel):
    """Request body for the prediction endpoint: a single raw text string."""

    # Raw input text to classify.
    text: str
def download_repo():
    """Download a snapshot of the model repo from the Hugging Face Hub into LOCAL_DIR."""
    # Token is only required when the repo is private; set it as a Space secret.
    hub_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    LOCAL_DIR.mkdir(parents=True, exist_ok=True)
    snapshot_download(
        repo_id=MODEL_ID,
        repo_type="model",
        local_dir=str(LOCAL_DIR),
        token=hub_token,
    )
def list_downloaded() -> List[str]:
    """Return a sorted list of every downloaded file path, relative to LOCAL_DIR."""
    return sorted(
        str(entry.relative_to(LOCAL_DIR))
        for entry in LOCAL_DIR.rglob("*")
        if entry.is_file()
    )
def find_joblib_artifact() -> Optional[Path]:
    """Return the first *.joblib / *.pkl file found under LOCAL_DIR, or None if absent."""
    return next(
        (
            candidate
            for candidate in LOCAL_DIR.rglob("*")
            if candidate.is_file() and candidate.suffix in JOBLIB_EXTS
        ),
        None,
    )
def load_csv_for_training() -> pd.DataFrame:
    """Locate a training CSV inside the downloaded repo and return it as a DataFrame.

    A fixed list of known locations/filenames is probed in order; the first
    existing file with both 'text' and 'label' columns wins (rows with NaN in
    either column are dropped). Raises FileNotFoundError when none qualify.
    """
    candidates = [
        LOCAL_DIR / "data" / "text_data.csv",
        LOCAL_DIR / "fallback_text_data.csv",
        LOCAL_DIR / "data" / "fallback_text_data.csv",
    ]
    required = {"text", "label"}
    for path in candidates:
        if not path.exists():
            continue
        frame = pd.read_csv(path)
        if required.issubset(frame.columns):
            print(f"[startup] Found training CSV: {path}")
            return frame.dropna(subset=["text", "label"])
        print(f"[startup] CSV {path} missing required columns 'text','label'")
    raise FileNotFoundError(
        f"No suitable CSV found. Looked for: {', '.join(str(c) for c in candidates)} "
        "with columns 'text','label'."
    )
def train_runtime_model(df: pd.DataFrame) -> Pipeline:
    """Fit a small TF-IDF + LogisticRegression pipeline on df and cache it to disk.

    Expects 'text' and 'label' columns; both are coerced to str before fitting.
    The fitted pipeline is saved to CACHED_MODEL so later startups skip training.
    """
    print("[startup] Training lightweight TF-IDF + LogisticRegression pipeline at runtime...")
    texts = df["text"].astype(str).values
    targets = df["label"].astype(str).values
    model = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=20000)),
        ("clf", LogisticRegression(max_iter=1000)),
    ])
    model.fit(texts, targets)
    # Persist the fitted pipeline so subsequent startups can load it directly.
    LOCAL_DIR.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, CACHED_MODEL)
    print(f"[startup] Saved trained pipeline to {CACHED_MODEL}")
    return model
def init_model() -> Pipeline:
    """Resolve a ready-to-use Pipeline, preferring cached/pre-built artifacts.

    Resolution order: locally cached pipeline -> joblib/pkl artifact from the
    Hub repo -> train a fresh pipeline from a CSV shipped with the repo.
    """
    # 1) If we already cached a trained pipeline, use it.
    if CACHED_MODEL.exists():
        print(f"[startup] Loading cached pipeline: {CACHED_MODEL}")
        return joblib.load(CACHED_MODEL)
    # 2) Download repo and inspect files
    download_repo()
    files = list_downloaded()
    print(f"[startup] Downloaded files from {MODEL_ID}:\n" + "\n".join(files))
    # 3) If a joblib/pkl exists anywhere, load it.
    art = find_joblib_artifact()
    if art:
        print(f"[startup] Loading scikit-learn artifact: {art}")
        # NOTE(review): joblib.load unpickles arbitrary objects — acceptable only
        # because the artifact comes from our own repo (MODEL_ID); confirm trust.
        return joblib.load(art)
    # 4) Otherwise, train from a CSV inside the repo and cache
    df = load_csv_for_training()
    return train_runtime_model(df)
# Load model at startup (module import time) so the first request is served fast;
# failures here intentionally crash the app rather than serve without a model.
PIPE = init_model()
# NOTE(review): no route decorator was visible in the original source (possibly
# lost in transcription); without one this handler is never reachable over HTTP.
@app.get("/health")
def health():
    """Liveness/info endpoint: reports the model id and which artifact is in use.

    Returns a JSON object with service name, model repo id, the cached artifact
    path (or "runtime" when no cached file exists yet), and a static status.
    """
    return {
        "service": "EAI6010 Text Classifier",
        "model_id": MODEL_ID,
        "artifact": str(CACHED_MODEL) if CACHED_MODEL.exists() else "runtime",
        "status": "ok"
    }
# NOTE(review): no route decorator was visible in the original source (possibly
# lost in transcription); without one this handler is never reachable over HTTP.
@app.post("/predict")
def predict(req: PredictRequest):
    """Classify req.text with the module-level pipeline PIPE.

    Returns the echoed input, the top predicted label, and a per-class
    probability mapping (class name -> float score).
    """
    proba = PIPE.predict_proba([req.text])[0]
    labels = PIPE.classes_
    top = labels[int(np.argmax(proba))]
    # Pair classes with their probabilities directly instead of indexing by range.
    scores: Dict[str, float] = {
        str(label): float(p) for label, p in zip(labels, proba)
    }
    return {"input": req.text, "label": str(top), "scores": scores}