Spaces:
Sleeping
Sleeping
| import os | |
| from pathlib import Path | |
| from typing import Dict, List, Optional | |
| import numpy as np | |
| import pandas as pd | |
| import joblib | |
| from fastapi import FastAPI | |
| from pydantic import BaseModel | |
| from huggingface_hub import snapshot_download | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.linear_model import LogisticRegression | |
# Hugging Face Hub repo that holds the model artifacts and/or training CSV.
MODEL_ID = "sjsoares/6100.Model.Soares"
# Local directory the Hub snapshot is downloaded into.
LOCAL_DIR = Path("model_artifacts")
# File extensions treated as serialized scikit-learn artifacts.
JOBLIB_EXTS = (".joblib", ".pkl")
CACHED_MODEL = LOCAL_DIR / "model.joblib"  # where we will save a trained pipeline if needed
# FastAPI application; this metadata appears in the auto-generated OpenAPI docs.
app = FastAPI(
    title="EAI6010 — Week 5 (Text Classification)",
    description="Robust loader: use joblib if present; else train from repo CSV at runtime.",
    version="1.0.0",
)
class PredictRequest(BaseModel):
    """Request body for the prediction endpoint: a single raw text string."""

    # Raw input text to classify.
    text: str
def download_repo():
    """Download a snapshot of the model repo from the Hugging Face Hub into LOCAL_DIR."""
    # Token is only required when the repo is private; set it as a Space secret.
    hub_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    LOCAL_DIR.mkdir(parents=True, exist_ok=True)
    snapshot_download(
        repo_id=MODEL_ID,
        repo_type="model",
        local_dir=str(LOCAL_DIR),
        token=hub_token,
    )
def list_downloaded() -> List[str]:
    """Return a sorted list of every downloaded file path, relative to LOCAL_DIR."""
    return sorted(
        str(entry.relative_to(LOCAL_DIR))
        for entry in LOCAL_DIR.rglob("*")
        if entry.is_file()
    )
def find_joblib_artifact() -> Optional[Path]:
    """Return the first *.joblib / *.pkl file found under LOCAL_DIR, or None if absent."""
    return next(
        (
            candidate
            for candidate in LOCAL_DIR.rglob("*")
            if candidate.is_file() and candidate.suffix in JOBLIB_EXTS
        ),
        None,
    )
def load_csv_for_training() -> pd.DataFrame:
    """Locate a training CSV inside the downloaded repo and return it as a DataFrame.

    A fixed list of known locations/filenames is probed in order; the first
    existing file with both 'text' and 'label' columns wins (rows with NaN in
    either column are dropped). Raises FileNotFoundError when none qualify.
    """
    candidates = [
        LOCAL_DIR / "data" / "text_data.csv",
        LOCAL_DIR / "fallback_text_data.csv",
        LOCAL_DIR / "data" / "fallback_text_data.csv",
    ]
    required = {"text", "label"}
    for path in candidates:
        if not path.exists():
            continue
        frame = pd.read_csv(path)
        if required.issubset(frame.columns):
            print(f"[startup] Found training CSV: {path}")
            return frame.dropna(subset=["text", "label"])
        print(f"[startup] CSV {path} missing required columns 'text','label'")
    raise FileNotFoundError(
        f"No suitable CSV found. Looked for: {', '.join(str(c) for c in candidates)} "
        "with columns 'text','label'."
    )
def train_runtime_model(df: pd.DataFrame) -> Pipeline:
    """Fit a small TF-IDF + LogisticRegression pipeline on df and cache it to disk.

    Expects 'text' and 'label' columns; both are coerced to str before fitting.
    The fitted pipeline is saved to CACHED_MODEL so later startups skip training.
    """
    print("[startup] Training lightweight TF-IDF + LogisticRegression pipeline at runtime...")
    texts = df["text"].astype(str).values
    targets = df["label"].astype(str).values
    model = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=20000)),
        ("clf", LogisticRegression(max_iter=1000)),
    ])
    model.fit(texts, targets)
    # Persist the fitted pipeline so subsequent startups can load it directly.
    LOCAL_DIR.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, CACHED_MODEL)
    print(f"[startup] Saved trained pipeline to {CACHED_MODEL}")
    return model
def init_model() -> Pipeline:
    """Resolve a ready-to-use Pipeline, preferring cached/pre-built artifacts.

    Resolution order: locally cached pipeline -> joblib/pkl artifact from the
    Hub repo -> train a fresh pipeline from a CSV shipped with the repo.
    """
    # 1) If we already cached a trained pipeline, use it.
    if CACHED_MODEL.exists():
        print(f"[startup] Loading cached pipeline: {CACHED_MODEL}")
        return joblib.load(CACHED_MODEL)
    # 2) Download repo and inspect files
    download_repo()
    files = list_downloaded()
    print(f"[startup] Downloaded files from {MODEL_ID}:\n" + "\n".join(files))
    # 3) If a joblib/pkl exists anywhere, load it.
    art = find_joblib_artifact()
    if art:
        print(f"[startup] Loading scikit-learn artifact: {art}")
        # NOTE(review): joblib.load unpickles arbitrary objects — acceptable only
        # because the artifact comes from our own repo (MODEL_ID); confirm trust.
        return joblib.load(art)
    # 4) Otherwise, train from a CSV inside the repo and cache
    df = load_csv_for_training()
    return train_runtime_model(df)
# Load model at startup (module import time) so the first request is served fast;
# failures here intentionally crash the app rather than serve without a model.
PIPE = init_model()
# NOTE(review): no route decorator was visible in the original source (possibly
# lost in transcription); without one this handler is never reachable over HTTP.
@app.get("/health")
def health():
    """Liveness/info endpoint: reports the model id and which artifact is in use.

    Returns a JSON object with service name, model repo id, the cached artifact
    path (or "runtime" when no cached file exists yet), and a static status.
    """
    return {
        "service": "EAI6010 Text Classifier",
        "model_id": MODEL_ID,
        "artifact": str(CACHED_MODEL) if CACHED_MODEL.exists() else "runtime",
        "status": "ok"
    }
# NOTE(review): no route decorator was visible in the original source (possibly
# lost in transcription); without one this handler is never reachable over HTTP.
@app.post("/predict")
def predict(req: PredictRequest):
    """Classify req.text with the module-level pipeline PIPE.

    Returns the echoed input, the top predicted label, and a per-class
    probability mapping (class name -> float score).
    """
    proba = PIPE.predict_proba([req.text])[0]
    labels = PIPE.classes_
    top = labels[int(np.argmax(proba))]
    # Pair classes with their probabilities directly instead of indexing by range.
    scores: Dict[str, float] = {
        str(label): float(p) for label, p in zip(labels, proba)
    }
    return {"input": req.text, "label": str(top), "scores": scores}