Spaces:

ethnmcl
/

EntrepreneurialReadinessScoreAPI

Sleeping

App Files Community

ethnmcl committed on Aug 27, 2025

Commit

c5ddf38

verified ·

1 Parent(s): 334fef3

Update main.py

Browse files

Files changed (1) hide show

main.py +15 -66

main.py CHANGED Viewed

@@ -6,8 +6,7 @@ import typing as T
 from functools import lru_cache
 import pandas as pd
-from fastapi import FastAPI, File, UploadFile, HTTPException
-from fastapi import Body
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
@@ -18,6 +17,7 @@ import numpy as np
 import torch
 from transformers import pipeline
 HF_TOKEN = (
     os.environ.get("HF_TOKEN")
     or os.environ.get("HUGGING_FACE_HUB_TOKEN")
@@ -31,26 +31,22 @@ app = FastAPI(
     title="Entrepreneur Readiness API",
     description=(
         "XGBoost readiness scoring + GPT-2 summarization.\n\n"
-        "Models:\n"
-        f"- {XGB_REPO}\n- {GPT2_REPO}\n"
         "Use /docs for interactive testing."
     ),
-    version="1.0.0",
 )
-# CORS (relaxed so you can call from browsers / Framer, etc.)
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # tighten if needed
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
-# -----------------------------
-# Model loading
-# -----------------------------
 def _find_file(dirpath: str, candidates: T.Sequence[str], fallback_exts: T.Sequence[str] = ()) -> str:
     for name in candidates:
         p = os.path.join(dirpath, name)
@@ -61,20 +57,18 @@ def _find_file(dirpath: str, candidates: T.Sequence[str], fallback_exts: T.Seque
             return os.path.join(dirpath, fname)
     raise FileNotFoundError(f"Could not find any of {candidates} (or {fallback_exts}) in {dirpath}")
 @lru_cache(maxsize=1)
 def _download_artifacts() -> T.Tuple[str, str]:
     if HF_TOKEN:
         try:
             login(token=HF_TOKEN, add_to_git_credential=True)
         except Exception:
-            # If public, keep going
             pass
     xgb_local = snapshot_download(repo_id=XGB_REPO, token=HF_TOKEN, revision=None)
     gpt_local = snapshot_download(repo_id=GPT2_REPO, token=HF_TOKEN, revision=None)
     return xgb_local, gpt_local
 @lru_cache(maxsize=1)
 def _load_models():
     xgb_dir, gpt_dir = _download_artifacts()
@@ -108,7 +102,7 @@ def _load_models():
     booster = xgb.Booster()
     booster.load_model(booster_path)
-    # GPT-2 pipeline
     device = 0 if torch.cuda.is_available() else -1
     text_gen = pipeline(
         "text-generation",
@@ -117,13 +111,9 @@ def _load_models():
         device=device,
         trust_remote_code=False,
     )
     return preprocessor, booster, text_gen, xgb_dir
-# -----------------------------
-# Utils
-# -----------------------------
 def _coerce_numeric(df: pd.DataFrame) -> pd.DataFrame:
     out = df.copy()
     for c in out.columns:
@@ -134,18 +124,14 @@ def _coerce_numeric(df: pd.DataFrame) -> pd.DataFrame:
                 pass
     return out
 def _to_dmatrix(df: pd.DataFrame, preprocessor) -> xgb.DMatrix:
     X = preprocessor.transform(df)
     return xgb.DMatrix(X)
 def _predict_scores(df: pd.DataFrame, preprocessor, booster) -> np.ndarray:
     dmat = _to_dmatrix(df, preprocessor)
     scores = booster.predict(dmat)
-    scores = np.array(scores).reshape(-1)
-    return scores
 def _format_prompt(inputs: dict, score: float) -> str:
     kv = "; ".join(f"{k}: {v}" for k, v in inputs.items())
@@ -155,11 +141,9 @@ def _format_prompt(inputs: dict, score: float) -> str:
         "Summary:"
     )
 def _summarize(inputs: dict, score: float, text_gen) -> str:
-    prompt = _format_prompt(inputs, score)
     out = text_gen(
-        prompt,
         max_new_tokens=120,
         do_sample=True,
         temperature=0.7,
@@ -169,47 +153,31 @@ def _summarize(inputs: dict, score: float, text_gen) -> str:
     )[0]["generated_text"]
     return out.split("Summary:", 1)[-1].strip() if "Summary:" in out else out.strip()
-# -----------------------------
-# Schemas
-# -----------------------------
-class RowDict(BaseModel):
-    __root__: dict
 class ScoreRequest(BaseModel):
     rows: T.List[dict] = Field(..., description="List of row objects (feature_name -> value).")
 class ScoreResponse(BaseModel):
     scores: T.List[float]
 class SummarizeRequest(BaseModel):
     inputs: dict = Field(..., description="Feature dict for one example.")
     score: float = Field(..., description="Readiness score used in the summary.")
 class SummarizeResponse(BaseModel):
     summary: str
 class ScoreAndSummarizeRequest(BaseModel):
     rows: T.List[dict] = Field(..., description="Rows to score and summarize.")
 class ScoreAndSummarizeItem(BaseModel):
     score: float
     summary: str
 class ScoreAndSummarizeResponse(BaseModel):
     results: T.List[ScoreAndSummarizeItem]
-# -----------------------------
-# Endpoints
-# -----------------------------
 @app.get("/health")
 def health():
     try:
@@ -218,31 +186,21 @@ def health():
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.post("/score", response_model=ScoreResponse)
 def score_json(req: ScoreRequest = Body(...)):
-    """
-    Score a JSON batch of rows.
-    """
     preprocessor, booster, _, _ = _load_models()
     if not req.rows:
         raise HTTPException(status_code=400, detail="rows must be non-empty")
     df = pd.DataFrame(req.rows)
     df = _coerce_numeric(df)
     try:
         scores = _predict_scores(df, preprocessor, booster)
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Scoring failed: {e}")
     return ScoreResponse(scores=[float(s) for s in scores])
 @app.post("/score_csv", response_model=ScoreResponse)
 async def score_csv(file: UploadFile = File(...)):
-    """
-    Score a CSV upload. Returns the scores list in row order.
-    """
     preprocessor, booster, _, _ = _load_models()
     try:
         content = await file.read()
@@ -253,12 +211,8 @@ async def score_csv(file: UploadFile = File(...)):
         raise HTTPException(status_code=400, detail=f"CSV scoring failed: {e}")
     return ScoreResponse(scores=[float(s) for s in scores])
 @app.post("/summarize", response_model=SummarizeResponse)
 def summarize(req: SummarizeRequest = Body(...)):
-    """
-    Summarize a single example given inputs + score.
-    """
     _, _, text_gen, _ = _load_models()
     try:
         summary = _summarize(req.inputs, req.score, text_gen)
@@ -266,23 +220,17 @@ def summarize(req: SummarizeRequest = Body(...)):
         raise HTTPException(status_code=400, detail=f"Summarization failed: {e}")
     return SummarizeResponse(summary=summary)
 @app.post("/score_and_summarize", response_model=ScoreAndSummarizeResponse)
 def score_and_summarize(req: ScoreAndSummarizeRequest = Body(...)):
-    """
-    For each row: compute score, then generate a GPT-2 summary.
-    """
     preprocessor, booster, text_gen, _ = _load_models()
     if not req.rows:
         raise HTTPException(status_code=400, detail="rows must be non-empty")
     df = pd.DataFrame(req.rows)
     df = _coerce_numeric(df)
     try:
         scores = _predict_scores(df, preprocessor, booster)
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Scoring failed: {e}")
     results = []
     for i, row in enumerate(req.rows):
         try:
@@ -291,3 +239,4 @@ def score_and_summarize(req: ScoreAndSummarizeRequest = Body(...)):
             summ = f"(summary failed: {e})"
         results.append(ScoreAndSummarizeItem(score=float(scores[i]), summary=summ))
     return ScoreAndSummarizeResponse(results=results)

 from functools import lru_cache
 import pandas as pd
+from fastapi import FastAPI, File, UploadFile, HTTPException, Body
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 import torch
 from transformers import pipeline
+# -------- Config --------
 HF_TOKEN = (
     os.environ.get("HF_TOKEN")
     or os.environ.get("HUGGING_FACE_HUB_TOKEN")
     title="Entrepreneur Readiness API",
     description=(
         "XGBoost readiness scoring + GPT-2 summarization.\n\n"
+        f"Models:\n- {XGB_REPO}\n- {GPT2_REPO}\n"
         "Use /docs for interactive testing."
     ),
+    version="1.0.1",
 )
+# CORS (allow all; tighten for production)
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
+# -------- Model loading --------
 def _find_file(dirpath: str, candidates: T.Sequence[str], fallback_exts: T.Sequence[str] = ()) -> str:
     for name in candidates:
         p = os.path.join(dirpath, name)
             return os.path.join(dirpath, fname)
     raise FileNotFoundError(f"Could not find any of {candidates} (or {fallback_exts}) in {dirpath}")
 @lru_cache(maxsize=1)
 def _download_artifacts() -> T.Tuple[str, str]:
     if HF_TOKEN:
         try:
             login(token=HF_TOKEN, add_to_git_credential=True)
         except Exception:
+            # Public models still download
             pass
     xgb_local = snapshot_download(repo_id=XGB_REPO, token=HF_TOKEN, revision=None)
     gpt_local = snapshot_download(repo_id=GPT2_REPO, token=HF_TOKEN, revision=None)
     return xgb_local, gpt_local
 @lru_cache(maxsize=1)
 def _load_models():
     xgb_dir, gpt_dir = _download_artifacts()
     booster = xgb.Booster()
     booster.load_model(booster_path)
+    # GPT-2 text generation
     device = 0 if torch.cuda.is_available() else -1
     text_gen = pipeline(
         "text-generation",
         device=device,
         trust_remote_code=False,
     )
     return preprocessor, booster, text_gen, xgb_dir
+# -------- Utils --------
 def _coerce_numeric(df: pd.DataFrame) -> pd.DataFrame:
     out = df.copy()
     for c in out.columns:
                 pass
     return out
 def _to_dmatrix(df: pd.DataFrame, preprocessor) -> xgb.DMatrix:
     X = preprocessor.transform(df)
     return xgb.DMatrix(X)
 def _predict_scores(df: pd.DataFrame, preprocessor, booster) -> np.ndarray:
     dmat = _to_dmatrix(df, preprocessor)
     scores = booster.predict(dmat)
+    return np.array(scores).reshape(-1)
 def _format_prompt(inputs: dict, score: float) -> str:
     kv = "; ".join(f"{k}: {v}" for k, v in inputs.items())
         "Summary:"
     )
 def _summarize(inputs: dict, score: float, text_gen) -> str:
     out = text_gen(
+        _format_prompt(inputs, score),
         max_new_tokens=120,
         do_sample=True,
         temperature=0.7,
     )[0]["generated_text"]
     return out.split("Summary:", 1)[-1].strip() if "Summary:" in out else out.strip()
+# -------- Schemas (Pydantic v2) --------
 class ScoreRequest(BaseModel):
     rows: T.List[dict] = Field(..., description="List of row objects (feature_name -> value).")
 class ScoreResponse(BaseModel):
     scores: T.List[float]
 class SummarizeRequest(BaseModel):
     inputs: dict = Field(..., description="Feature dict for one example.")
     score: float = Field(..., description="Readiness score used in the summary.")
 class SummarizeResponse(BaseModel):
     summary: str
 class ScoreAndSummarizeRequest(BaseModel):
     rows: T.List[dict] = Field(..., description="Rows to score and summarize.")
 class ScoreAndSummarizeItem(BaseModel):
     score: float
     summary: str
 class ScoreAndSummarizeResponse(BaseModel):
     results: T.List[ScoreAndSummarizeItem]
+# -------- Endpoints --------
 @app.get("/health")
 def health():
     try:
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.post("/score", response_model=ScoreResponse)
 def score_json(req: ScoreRequest = Body(...)):
     preprocessor, booster, _, _ = _load_models()
     if not req.rows:
         raise HTTPException(status_code=400, detail="rows must be non-empty")
     df = pd.DataFrame(req.rows)
     df = _coerce_numeric(df)
     try:
         scores = _predict_scores(df, preprocessor, booster)
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Scoring failed: {e}")
     return ScoreResponse(scores=[float(s) for s in scores])
 @app.post("/score_csv", response_model=ScoreResponse)
 async def score_csv(file: UploadFile = File(...)):
     preprocessor, booster, _, _ = _load_models()
     try:
         content = await file.read()
         raise HTTPException(status_code=400, detail=f"CSV scoring failed: {e}")
     return ScoreResponse(scores=[float(s) for s in scores])
 @app.post("/summarize", response_model=SummarizeResponse)
 def summarize(req: SummarizeRequest = Body(...)):
     _, _, text_gen, _ = _load_models()
     try:
         summary = _summarize(req.inputs, req.score, text_gen)
         raise HTTPException(status_code=400, detail=f"Summarization failed: {e}")
     return SummarizeResponse(summary=summary)
 @app.post("/score_and_summarize", response_model=ScoreAndSummarizeResponse)
 def score_and_summarize(req: ScoreAndSummarizeRequest = Body(...)):
     preprocessor, booster, text_gen, _ = _load_models()
     if not req.rows:
         raise HTTPException(status_code=400, detail="rows must be non-empty")
     df = pd.DataFrame(req.rows)
     df = _coerce_numeric(df)
     try:
         scores = _predict_scores(df, preprocessor, booster)
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Scoring failed: {e}")
     results = []
     for i, row in enumerate(req.rows):
         try:
             summ = f"(summary failed: {e})"
         results.append(ScoreAndSummarizeItem(score=float(scores[i]), summary=summ))
     return ScoreAndSummarizeResponse(results=results)