ethnmcl commited on
Commit
e3a7252
·
verified ·
1 Parent(s): f918bcb

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +293 -0
main.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+ import os
3
+ import io
4
+ import json
5
+ import typing as T
6
+ from functools import lru_cache
7
+
8
+ import pandas as pd
9
+ from fastapi import FastAPI, File, UploadFile, HTTPException
10
+ from fastapi import Body
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from pydantic import BaseModel, Field
13
+
14
+ from huggingface_hub import login, snapshot_download
15
+ import joblib
16
+ import xgboost as xgb
17
+ import numpy as np
18
+ import torch
19
+ from transformers import pipeline
20
+
21
# Hugging Face auth token: first matching env var wins; None when unset
# (public repos still download without a token).
HF_TOKEN = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
)
26
+
27
# Hub repo ids for the two model artifacts this API serves.
XGB_REPO = "ethnmcl/entrepreneur-readiness-xgb"
GPT2_REPO = "ethnmcl/gpt2-entrepreneur-agent"
29
+
30
# FastAPI application; title/description/version are rendered on /docs.
app = FastAPI(
    title="Entrepreneur Readiness API",
    description=(
        "XGBoost readiness scoring + GPT-2 summarization.\n\n"
        "Models:\n"
        f"- {XGB_REPO}\n- {GPT2_REPO}\n"
        "Use /docs for interactive testing."
    ),
    version="1.0.0",
)
40
+
41
# CORS (relaxed so you can call from browsers / Framer, etc.)
# NOTE(review): per the CORS spec, browsers reject `allow_origins=["*"]`
# combined with `allow_credentials=True` for credentialed requests —
# list explicit origins if cookies/auth headers must cross origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # tighten if needed
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
49
+
50
+
51
+ # -----------------------------
52
+ # Model loading
53
+ # -----------------------------
54
+ def _find_file(dirpath: str, candidates: T.Sequence[str], fallback_exts: T.Sequence[str] = ()) -> str:
55
+ for name in candidates:
56
+ p = os.path.join(dirpath, name)
57
+ if os.path.exists(p):
58
+ return p
59
+ for fname in os.listdir(dirpath):
60
+ if any(fname.endswith(ext) for ext in fallback_exts):
61
+ return os.path.join(dirpath, fname)
62
+ raise FileNotFoundError(f"Could not find any of {candidates} (or {fallback_exts}) in {dirpath}")
63
+
64
+
65
@lru_cache(maxsize=1)
def _download_artifacts() -> T.Tuple[str, str]:
    """Snapshot both Hub repos once per process.

    Returns:
        Tuple of local directories: (xgb_repo_dir, gpt2_repo_dir).
    """
    if HF_TOKEN:
        try:
            login(token=HF_TOKEN, add_to_git_credential=True)
        except Exception:
            # Login is best-effort: public repos download fine without it.
            pass
    local_dirs = [
        snapshot_download(repo_id=repo, token=HF_TOKEN, revision=None)
        for repo in (XGB_REPO, GPT2_REPO)
    ]
    return local_dirs[0], local_dirs[1]
76
+
77
+
78
@lru_cache(maxsize=1)
def _load_models():
    """Load all inference artifacts once and cache them.

    Returns:
        (preprocessor, booster, text_gen, xgb_dir) where *preprocessor* is the
        joblib-loaded transformer, *booster* the raw XGBoost Booster,
        *text_gen* the transformers text-generation pipeline, and *xgb_dir*
        the local snapshot directory of the XGBoost repo.
    """
    xgb_dir, gpt_dir = _download_artifacts()

    # Fitted preprocessor saved alongside the booster.
    preprocessor = joblib.load(
        _find_file(
            xgb_dir,
            candidates=[
                "readiness_preprocessor.joblib",
                "preprocessor.joblib",
                "preprocessor.pkl",
                "readiness_preprocessor.pkl",
            ],
            fallback_exts=(".joblib", ".pkl"),
        )
    )

    # Raw Booster (not the sklearn wrapper) loaded from its serialized form.
    booster = xgb.Booster()
    booster.load_model(
        _find_file(
            xgb_dir,
            candidates=[
                "xgb_readiness_model.json",
                "xgb_model.json",
                "model.json",
                "model.ubj",
                "model.bin",
                "readiness_xgb.json",
            ],
            fallback_exts=(".json", ".ubj", ".bin"),
        )
    )

    # GPT-2 generation pipeline; device 0 = first GPU, -1 = CPU.
    target_device = 0 if torch.cuda.is_available() else -1
    text_gen = pipeline(
        "text-generation",
        model=gpt_dir,
        tokenizer=gpt_dir,
        device=target_device,
        trust_remote_code=False,
    )

    return preprocessor, booster, text_gen, xgb_dir
122
+
123
+
124
+ # -----------------------------
125
+ # Utils
126
+ # -----------------------------
127
+ def _coerce_numeric(df: pd.DataFrame) -> pd.DataFrame:
128
+ out = df.copy()
129
+ for c in out.columns:
130
+ if out[c].dtype == object:
131
+ try:
132
+ out[c] = pd.to_numeric(out[c])
133
+ except Exception:
134
+ pass
135
+ return out
136
+
137
+
138
def _to_dmatrix(df: pd.DataFrame, preprocessor) -> xgb.DMatrix:
    """Run *df* through the fitted preprocessor and wrap the result as a DMatrix."""
    return xgb.DMatrix(preprocessor.transform(df))
141
+
142
+
143
def _predict_scores(df: pd.DataFrame, preprocessor, booster) -> np.ndarray:
    """Score every row of *df* with the booster; returns a flat 1-D float array."""
    raw = booster.predict(_to_dmatrix(df, preprocessor))
    # Flatten so callers always see one score per input row.
    return np.array(raw).reshape(-1)
148
+
149
+
150
+ def _format_prompt(inputs: dict, score: float) -> str:
151
+ kv = "; ".join(f"{k}: {v}" for k, v in inputs.items())
152
+ return (
153
+ "Summarize the entrepreneur readiness profile succinctly.\n"
154
+ f"Inputs -> {kv}; Score -> {score:.3f}\n"
155
+ "Summary:"
156
+ )
157
+
158
+
159
def _summarize(inputs: dict, score: float, text_gen) -> str:
    """Generate a short natural-language summary for one scored example.

    Runs the GPT-2 pipeline on the formatted prompt, then strips everything up
    to and including the "Summary:" marker when present; otherwise returns the
    whole generation, trimmed.
    """
    prompt = _format_prompt(inputs, score)
    generations = text_gen(
        prompt,
        max_new_tokens=120,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        num_return_sequences=1,
        eos_token_id=None,
    )
    text = generations[0]["generated_text"]
    marker = "Summary:"
    if marker in text:
        return text.split(marker, 1)[-1].strip()
    return text.strip()
171
+
172
+
173
+ # -----------------------------
174
+ # Schemas
175
+ # -----------------------------
176
class RowDict(BaseModel):
    # Pydantic v1 "custom root type": the model wraps a bare dict.
    # NOTE(review): `__root__` was removed in pydantic v2 (RootModel replaces
    # it), and this class appears unreferenced in the visible file — confirm
    # before keeping.
    __root__: dict
178
+
179
+
180
class ScoreRequest(BaseModel):
    # Batch scoring payload for /score: one dict per row.
    rows: T.List[dict] = Field(..., description="List of row objects (feature_name -> value).")
182
+
183
+
184
class ScoreResponse(BaseModel):
    # Predicted readiness scores, in the same order as the submitted rows.
    scores: T.List[float]
186
+
187
+
188
class SummarizeRequest(BaseModel):
    # Single-example payload for /summarize: the raw features plus a
    # previously computed readiness score to interpolate into the prompt.
    inputs: dict = Field(..., description="Feature dict for one example.")
    score: float = Field(..., description="Readiness score used in the summary.")
191
+
192
+
193
class SummarizeResponse(BaseModel):
    # GPT-2-generated summary text (marker-stripped).
    summary: str
195
+
196
+
197
class ScoreAndSummarizeRequest(BaseModel):
    # Batch payload for /score_and_summarize: one dict per row.
    rows: T.List[dict] = Field(..., description="Rows to score and summarize.")
199
+
200
+
201
class ScoreAndSummarizeItem(BaseModel):
    # One row's result: its score and the generated summary
    # (or a "(summary failed: ...)" placeholder on generation error).
    score: float
    summary: str
204
+
205
+
206
class ScoreAndSummarizeResponse(BaseModel):
    # Per-row results, in the same order as the submitted rows.
    results: T.List[ScoreAndSummarizeItem]
208
+
209
+
210
+ # -----------------------------
211
+ # Endpoints
212
+ # -----------------------------
213
@app.get("/health")
def health():
    """Liveness probe; also forces lazy model loading so failures surface here."""
    try:
        _load_models()
    except Exception as exc:
        # Any download/load failure becomes a 500 carrying the root message.
        raise HTTPException(status_code=500, detail=str(exc))
    return {"ok": True}
220
+
221
+
222
@app.post("/score", response_model=ScoreResponse)
def score_json(req: ScoreRequest = Body(...)):
    """
    Score a JSON batch of rows.
    """
    preprocessor, booster, _, _ = _load_models()
    if not req.rows:
        raise HTTPException(status_code=400, detail="rows must be non-empty")

    # Build the frame and coax string-typed numeric columns back to numbers.
    frame = _coerce_numeric(pd.DataFrame(req.rows))
    try:
        scores = _predict_scores(frame, preprocessor, booster)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Scoring failed: {e}")

    return ScoreResponse(scores=[float(s) for s in scores])
239
+
240
+
241
@app.post("/score_csv", response_model=ScoreResponse)
async def score_csv(file: UploadFile = File(...)):
    """
    Score a CSV upload. Returns the scores list in row order.
    """
    preprocessor, booster, _, _ = _load_models()
    try:
        raw = await file.read()
        # Parse, coerce numeric-looking columns, then score.
        frame = _coerce_numeric(pd.read_csv(io.BytesIO(raw)))
        scores = _predict_scores(frame, preprocessor, booster)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"CSV scoring failed: {e}")
    return ScoreResponse(scores=[float(s) for s in scores])
255
+
256
+
257
@app.post("/summarize", response_model=SummarizeResponse)
def summarize(req: SummarizeRequest = Body(...)):
    """
    Summarize a single example given inputs + score.
    """
    _, _, text_gen, _ = _load_models()
    try:
        text = _summarize(req.inputs, req.score, text_gen)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Summarization failed: {e}")
    return SummarizeResponse(summary=text)
268
+
269
+
270
@app.post("/score_and_summarize", response_model=ScoreAndSummarizeResponse)
def score_and_summarize(req: ScoreAndSummarizeRequest = Body(...)):
    """
    For each row: compute score, then generate a GPT-2 summary.
    """
    preprocessor, booster, text_gen, _ = _load_models()
    if not req.rows:
        raise HTTPException(status_code=400, detail="rows must be non-empty")

    frame = _coerce_numeric(pd.DataFrame(req.rows))
    try:
        scores = _predict_scores(frame, preprocessor, booster)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Scoring failed: {e}")

    results = []
    for idx, row in enumerate(req.rows):
        # Summaries are best-effort: a generation failure for one row must
        # not abort the whole batch.
        try:
            summary = _summarize(row, float(scores[idx]), text_gen)
        except Exception as e:
            summary = f"(summary failed: {e})"
        results.append(ScoreAndSummarizeItem(score=float(scores[idx]), summary=summary))
    return ScoreAndSummarizeResponse(results=results)