"""Open Pair Scorer — Gradio app exposing single-row and batch CSV scoring
for an XGBoost binary classifier, optionally pulled from the Hugging Face Hub."""

import json
import os
import tempfile
import time

import gradio as gr
import pandas as pd
import xgboost as xgb

try:
    from huggingface_hub import hf_hub_download
except Exception:
    # huggingface_hub is optional; without it we only use the local model file.
    hf_hub_download = None

# -----------------------------
# Config
# -----------------------------
MODEL_LOCAL_PATH = os.getenv("MODEL_LOCAL_PATH", "model.xgb")
HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "")  # e.g. "tlogandesigns/open-pair-model"
HF_MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", "model.xgb")
THRESH_DEFAULT = float(os.getenv("THRESH_DEFAULT", "0.50"))

# If you engineer features from text, set a flag and implement text_to_features
USE_TEXT_PIPELINE = os.getenv("USE_TEXT_PIPELINE", "0") == "1"

# -----------------------------
# Model loading
# -----------------------------
_model = None


def load_model():
    """Load the XGBoost booster once and cache it in module state.

    Prefers a Hub download when HF_MODEL_REPO is set and huggingface_hub is
    importable; on any download failure it logs and falls back to
    MODEL_LOCAL_PATH.

    Returns:
        xgb.Booster: the loaded (and cached) model.
    """
    global _model
    if _model is not None:
        return _model
    path = MODEL_LOCAL_PATH
    if HF_MODEL_REPO and hf_hub_download:
        try:
            path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=HF_MODEL_FILENAME)
        except Exception as e:
            print(f"Hub download failed: {e}. Falling back to local path: {MODEL_LOCAL_PATH}")
    booster = xgb.Booster()
    booster.load_model(path)
    _model = booster
    return _model


# -----------------------------
# Feature engineering stubs
# -----------------------------
NUMERIC_COLUMNS = [
    # Replace with your production feature columns in the correct order
    # Example placeholders:
    "price",
    "beds",
    "baths",
    "sqft",
    "year_built",
    "agent_experience_years",
    "agent_transactions_12m",
    "distance_km",
    # NOTE(review): if USE_TEXT_PIPELINE is enabled, the keys returned by
    # text_to_features ("desc_len", "bio_len") must also be listed here —
    # coerce_and_fill keeps only NUMERIC_COLUMNS, so unlisted text features
    # are silently dropped before prediction.
]

TEXT_INPUTS = [
    # Examples if you do text based features
    "listing_description",
    "agent_bio",
]


def text_to_features(listing_description: str, agent_bio: str) -> dict:
    """Turn raw text inputs into numeric features.

    Replace with your real text feature extraction. Return a dict whose keys
    match the NUMERIC_COLUMNS entries that depend on text.
    """
    # Example toy features based on length
    return {
        "desc_len": len(listing_description or ""),
        "bio_len": len(agent_bio or ""),
    }


def coerce_and_fill(df: pd.DataFrame) -> pd.DataFrame:
    """Return a frame with exactly NUMERIC_COLUMNS, numeric, NaN-free.

    Missing columns are added as zeros, extras are dropped, and every value
    is coerced to numeric with un-parseable entries replaced by 0. Operates
    on a copy so the caller's DataFrame is never mutated.
    """
    df = df.copy()
    # Ensure all required columns exist. Fill missing with zeros to avoid crashes.
    for col in NUMERIC_COLUMNS:
        if col not in df.columns:
            df[col] = 0
    # Keep only columns in the expected order
    df = df[NUMERIC_COLUMNS]
    # Numeric coercion
    for c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)
    return df


# -----------------------------
# Inference
# -----------------------------
def score_one(
    price, beds, baths, sqft, year_built,
    agent_experience_years, agent_transactions_12m, distance_km,
    threshold, listing_description="", agent_bio=""
):
    """Score a single listing/agent pair and return a pretty-printed JSON blob.

    The JSON includes the probability, thresholded label, threshold used,
    inference latency in ms, and the echoed input row.
    """
    t0 = time.time()
    booster = load_model()
    row = {
        "price": price,
        "beds": beds,
        "baths": baths,
        "sqft": sqft,
        "year_built": year_built,
        "agent_experience_years": agent_experience_years,
        "agent_transactions_12m": agent_transactions_12m,
        "distance_km": distance_km,
    }
    if USE_TEXT_PIPELINE:
        row.update(text_to_features(listing_description, agent_bio))
    df = coerce_and_fill(pd.DataFrame([row]))
    dmatrix = xgb.DMatrix(df)
    proba = float(booster.predict(dmatrix)[0])
    label = int(proba >= threshold)
    latency_ms = int((time.time() - t0) * 1000)
    out = {
        "score": round(proba, 6),
        "label": label,
        "threshold": threshold,
        "latency_ms": latency_ms,
        "inputs": row,
    }
    return json.dumps(out, indent=2)


def score_batch(file, threshold):
    """Score every row of an uploaded CSV.

    Accepts a CSV with columns matching your expected schema. Extra columns
    are ignored. Missing ones are filled with zero.

    Returns:
        tuple[str | None, str]: (path to the scored CSV for download or None
        on failure, a status message).
    """
    if file is None:
        return None, "Could not read CSV"
    booster = load_model()
    # Newer Gradio versions hand the component value over as a filepath str;
    # older ones pass a tempfile-like object with a .name attribute.
    path = file if isinstance(file, str) else file.name
    try:
        df = pd.read_csv(path)
    except Exception:
        return None, "Could not read CSV"
    df_in = df.copy()
    if USE_TEXT_PIPELINE:
        # If you need to build numeric features from text columns
        if set(TEXT_INPUTS).issubset(df_in.columns):
            text_feats = df_in.apply(
                lambda r: pd.Series(text_to_features(
                    r.get("listing_description", ""),
                    r.get("agent_bio", ""),
                )),
                axis=1,
            )
            df_in = pd.concat([df_in, text_feats], axis=1)
    X = coerce_and_fill(df_in)
    dmatrix = xgb.DMatrix(X)
    probs = booster.predict(dmatrix)
    out = df.copy()
    out["score"] = probs
    out["label"] = (out["score"] >= threshold).astype(int)
    # Save to a temp file for download
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    out.to_csv(tmp.name, index=False)
    return tmp.name, f"Scored {len(out)} rows"


# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="Open Pair Scorer") as demo:
    gr.Markdown("# Open Pair Scorer\nSingle prediction and batch scoring in one place.")

    with gr.Tab("Single"):
        with gr.Row():
            price = gr.Number(label="price", value=350000, scale=1, min_width=160)
            beds = gr.Number(label="beds", value=3, scale=1, min_width=160)
            baths = gr.Number(label="baths", value=2, scale=1, min_width=160)
            sqft = gr.Number(label="sqft", value=1800, scale=1, min_width=160)
        with gr.Row():
            year_built = gr.Number(label="year_built", value=2005, scale=1, min_width=160)
            agent_experience_years = gr.Number(label="agent_experience_years", value=5, scale=1, min_width=160)
            agent_transactions_12m = gr.Number(label="agent_transactions_12m", value=18, scale=1, min_width=160)
            distance_km = gr.Number(label="distance_km", value=4.2, scale=1, min_width=160)
        with gr.Row():
            threshold = gr.Slider(label="decision threshold", value=THRESH_DEFAULT,
                                  minimum=0.0, maximum=1.0, step=0.01, scale=1, min_width=160)
        # Text inputs are only shown when the text pipeline is enabled.
        with gr.Row(visible=USE_TEXT_PIPELINE):
            listing_description = gr.Textbox(label="listing_description", lines=6,
                                             placeholder="Paste listing description", scale=1)
            agent_bio = gr.Textbox(label="agent_bio", lines=6,
                                   placeholder="Paste agent bio", scale=1)
        btn = gr.Button("Score")
        output = gr.Code(label="Result JSON", language="json")

        if USE_TEXT_PIPELINE:
            btn.click(
                score_one,
                inputs=[price, beds, baths, sqft, year_built,
                        agent_experience_years, agent_transactions_12m, distance_km,
                        threshold, listing_description, agent_bio],
                outputs=output,
            )
        else:
            btn.click(
                score_one,
                inputs=[price, beds, baths, sqft, year_built,
                        agent_experience_years, agent_transactions_12m, distance_km,
                        threshold],
                outputs=output,
            )

    with gr.Tab("Batch CSV"):
        gr.Markdown("Upload a CSV. It should contain your model feature columns. Extra columns are fine.")
        file_in = gr.File(label="CSV file", file_types=[".csv"])
        threshold_b = gr.Slider(label="decision threshold", value=THRESH_DEFAULT,
                                minimum=0.0, maximum=1.0, step=0.01)
        run_b = gr.Button("Score batch")
        file_out = gr.File(label="Download scored CSV")
        status = gr.Markdown()
        run_b.click(score_batch, inputs=[file_in, threshold_b], outputs=[file_out, status])

    gr.Markdown("Tip: set HF_MODEL_REPO and HF_MODEL_FILENAME in Space Secrets to pull your model from the Hub.")

if __name__ == "__main__":
    demo.queue(max_size=64).launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))