# Hugging Face Spaces page residue (captured Space status: "Sleeping") — not part of the app code.
import json
import os
import tempfile
import time

import gradio as gr
import pandas as pd
import xgboost as xgb

try:
    from huggingface_hub import hf_hub_download
except Exception:
    # huggingface_hub is optional; without it we fall back to the local model file.
    hf_hub_download = None
# -----------------------------
# Config
# -----------------------------
# Local fallback path for the serialized XGBoost model.
MODEL_LOCAL_PATH = os.getenv("MODEL_LOCAL_PATH", "model.xgb")
# When set, the model is pulled from this Hub repo instead of the local path.
HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "")  # e.g. "tlogandesigns/open-pair-model"
HF_MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", "model.xgb")
# Default decision threshold shown in the UI sliders.
THRESH_DEFAULT = float(os.getenv("THRESH_DEFAULT", "0.50"))

# If you engineer features from text, set a flag and implement text_to_features
USE_TEXT_PIPELINE = os.getenv("USE_TEXT_PIPELINE", "0") == "1"
# -----------------------------
# Model loading
# -----------------------------
_model = None  # process-wide cached Booster; populated lazily by load_model()


def load_model():
    """Return the cached XGBoost Booster, loading it on first call.

    Resolution order: if HF_MODEL_REPO is configured and huggingface_hub is
    importable, the model file is downloaded from the Hub; on any download
    failure (or when the Hub path is unconfigured) the local file at
    MODEL_LOCAL_PATH is used instead.
    """
    global _model
    if _model is None:
        model_path = MODEL_LOCAL_PATH
        if HF_MODEL_REPO and hf_hub_download:
            try:
                model_path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=HF_MODEL_FILENAME)
            except Exception as e:
                # Best-effort download: log and fall back to the local file.
                print(f"Hub download failed: {e}. Falling back to local path: {MODEL_LOCAL_PATH}")
        booster = xgb.Booster()
        booster.load_model(model_path)
        _model = booster
    return _model
# -----------------------------
# Feature engineering stubs
# -----------------------------

# Replace with your production feature columns, in the exact order the model
# was trained on. These placeholders mirror the single-prediction form below.
NUMERIC_COLUMNS = [
    "price",
    "beds",
    "baths",
    "sqft",
    "year_built",
    "agent_experience_years",
    "agent_transactions_12m",
    "distance_km",
]

# Free-text inputs used only when USE_TEXT_PIPELINE is enabled.
TEXT_INPUTS = [
    "listing_description",
    "agent_bio",
]
def text_to_features(listing_description: str, agent_bio: str) -> dict:
    """
    Replace with your real text feature extraction.
    Return a dict where keys match NUMERIC_COLUMNS that depend on text.
    """
    # Toy stand-in features: character lengths of the two text fields.
    # NOTE(review): "desc_len"/"bio_len" are not present in NUMERIC_COLUMNS,
    # so coerce_and_fill currently drops them — confirm against the real
    # production feature list.
    desc = listing_description or ""
    bio = agent_bio or ""
    return {"desc_len": len(desc), "bio_len": len(bio)}
def coerce_and_fill(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """Return a new frame restricted to the model's feature columns, numeric-coerced.

    Missing columns are created and filled with 0, extra columns are dropped,
    and non-numeric values are coerced to 0, so downstream DMatrix construction
    never crashes on a partial upload.

    Args:
        df: Input frame; it is NOT modified (the previous implementation added
            missing columns to the caller's frame in place).
        columns: Optional explicit column list/order; defaults to
            NUMERIC_COLUMNS.

    Returns:
        A DataFrame with exactly `columns` in order, all values numeric.
    """
    cols = list(NUMERIC_COLUMNS) if columns is None else list(columns)
    # reindex builds a fresh frame: keeps matching columns, creates missing
    # ones filled with 0, and drops extras — without touching the input.
    out = df.reindex(columns=cols, fill_value=0)
    for col in cols:
        out[col] = pd.to_numeric(out[col], errors="coerce").fillna(0)
    return out
# -----------------------------
# Inference
# -----------------------------
def score_one(
    price,
    beds,
    baths,
    sqft,
    year_built,
    agent_experience_years,
    agent_transactions_12m,
    distance_km,
    threshold,
    listing_description="",
    agent_bio="",
):
    """Score a single listing/agent pair.

    Builds a one-row feature frame from the form inputs (optionally augmented
    with text-derived features), runs the booster, and returns a pretty-printed
    JSON string with the score, thresholded label, latency, and echoed inputs.
    """
    start = time.time()
    booster = load_model()

    row = {
        "price": price,
        "beds": beds,
        "baths": baths,
        "sqft": sqft,
        "year_built": year_built,
        "agent_experience_years": agent_experience_years,
        "agent_transactions_12m": agent_transactions_12m,
        "distance_km": distance_km,
    }
    if USE_TEXT_PIPELINE:
        row.update(text_to_features(listing_description, agent_bio))

    features = coerce_and_fill(pd.DataFrame([row]))
    proba = float(booster.predict(xgb.DMatrix(features))[0])

    result = {
        "score": round(proba, 6),
        "label": int(proba >= threshold),
        "threshold": threshold,
        "latency_ms": int((time.time() - start) * 1000),
        "inputs": row,
    }
    return json.dumps(result, indent=2)
def score_batch(file, threshold):
    """
    Score every row of an uploaded CSV.

    Accepts a CSV with columns matching your expected schema.
    Extra columns are ignored. Missing are filled with zero.

    Args:
        file: Gradio file object (has a `.name` path) or None when nothing
            was uploaded.
        threshold: Decision threshold applied to the predicted scores.

    Returns:
        (path_to_scored_csv, status_message); the path is None on failure.
    """
    # Guard: the UI allows clicking "Score batch" with no file selected;
    # the original code crashed with AttributeError on file.name.
    if file is None:
        return None, "No file uploaded"
    booster = load_model()
    try:
        df = pd.read_csv(file.name)
    except Exception:
        return None, "Could not read CSV"
    df_in = df.copy()
    if USE_TEXT_PIPELINE and set(TEXT_INPUTS).issubset(df_in.columns):
        # Build numeric features from the text columns, one row at a time.
        text_feats = df_in.apply(
            lambda r: pd.Series(text_to_features(
                r.get("listing_description", ""),
                r.get("agent_bio", "")
            )),
            axis=1
        )
        df_in = pd.concat([df_in, text_feats], axis=1)
    X = coerce_and_fill(df_in)
    probs = booster.predict(xgb.DMatrix(X))
    out = df.copy()
    out["score"] = probs
    out["label"] = (out["score"] >= threshold).astype(int)
    # Save to a temp file for download. Close the handle before writing via
    # the path: on Windows an open NamedTemporaryFile cannot be reopened, and
    # this also avoids leaking the descriptor.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    tmp.close()
    out.to_csv(tmp.name, index=False)
    return tmp.name, f"Scored {len(out)} rows"
# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="Open Pair Scorer") as demo:
    gr.Markdown("# Open Pair Scorer\nSingle prediction and batch scoring in one place.")

    with gr.Tab("Single"):
        # Numeric feature inputs, laid out two rows of four.
        with gr.Row():
            price = gr.Number(label="price", value=350000, scale=1, min_width=160)
            beds = gr.Number(label="beds", value=3, scale=1, min_width=160)
            baths = gr.Number(label="baths", value=2, scale=1, min_width=160)
            sqft = gr.Number(label="sqft", value=1800, scale=1, min_width=160)
        with gr.Row():
            year_built = gr.Number(label="year_built", value=2005, scale=1, min_width=160)
            agent_experience_years = gr.Number(label="agent_experience_years", value=5, scale=1, min_width=160)
            agent_transactions_12m = gr.Number(label="agent_transactions_12m", value=18, scale=1, min_width=160)
            distance_km = gr.Number(label="distance_km", value=4.2, scale=1, min_width=160)
        with gr.Row():
            threshold = gr.Slider(label="decision threshold", value=THRESH_DEFAULT, minimum=0.0, maximum=1.0, step=0.01, scale=1, min_width=160)
        # Text inputs exist either way; the row is only shown in text mode.
        with gr.Row(visible=USE_TEXT_PIPELINE):
            listing_description = gr.Textbox(label="listing_description", lines=6, placeholder="Paste listing description", scale=1)
            agent_bio = gr.Textbox(label="agent_bio", lines=6, placeholder="Paste agent bio", scale=1)
        btn = gr.Button("Score")
        output = gr.Code(label="Result JSON", language="json")

        # Assemble the click inputs once; append the text boxes only when the
        # text pipeline is enabled (matches score_one's optional parameters).
        single_inputs = [
            price, beds, baths, sqft, year_built,
            agent_experience_years, agent_transactions_12m, distance_km,
            threshold,
        ]
        if USE_TEXT_PIPELINE:
            single_inputs += [listing_description, agent_bio]
        btn.click(score_one, inputs=single_inputs, outputs=output)

    with gr.Tab("Batch CSV"):
        gr.Markdown("Upload a CSV. It should contain your model feature columns. Extra columns are fine.")
        file_in = gr.File(label="CSV file", file_types=[".csv"])
        threshold_b = gr.Slider(label="decision threshold", value=THRESH_DEFAULT, minimum=0.0, maximum=1.0, step=0.01)
        run_b = gr.Button("Score batch")
        file_out = gr.File(label="Download scored CSV")
        status = gr.Markdown()
        run_b.click(score_batch, inputs=[file_in, threshold_b], outputs=[file_out, status])

    gr.Markdown("Tip: set HF_MODEL_REPO and HF_MODEL_FILENAME in Space Secrets to pull your model from the Hub.")
if __name__ == "__main__":
    # Queue limits concurrent requests; PORT (Space-provided) overrides 7860.
    port = int(os.getenv("PORT", "7860"))
    demo.queue(max_size=64).launch(server_name="0.0.0.0", server_port=port)