Spaces:

tlogandesigns
/

open-house-pairing

Sleeping

File size: 7,728 Bytes

import os
import json
import time
import tempfile
import pandas as pd
import xgboost as xgb
import gradio as gr

try:
    from huggingface_hub import hf_hub_download
except Exception:
    hf_hub_download = None

# -----------------------------
# Config
# -----------------------------
# Where the model file lives on disk when it is not pulled from the Hub.
MODEL_LOCAL_PATH = os.environ.get("MODEL_LOCAL_PATH", "model.xgb")
# Hub repository to download the model from, e.g. "tlogandesigns/open-pair-model".
# An empty string disables the Hub download.
HF_MODEL_REPO = os.environ.get("HF_MODEL_REPO", "")
# Name of the model file inside the Hub repository.
HF_MODEL_FILENAME = os.environ.get("HF_MODEL_FILENAME", "model.xgb")
# Initial position of the decision-threshold sliders.
THRESH_DEFAULT = float(os.environ.get("THRESH_DEFAULT", "0.50"))

# Set USE_TEXT_PIPELINE=1 when features are engineered from text; text_to_features
# must then be implemented for real.
USE_TEXT_PIPELINE = os.environ.get("USE_TEXT_PIPELINE", "0") == "1"

# -----------------------------
# Model loading
# -----------------------------
# Process-wide cache for the loaded booster; filled on the first load_model() call.
_model = None


def load_model():
    """Return the XGBoost booster, loading it on first use.

    The booster is cached in the module-level ``_model`` so later calls return
    the same object. When ``HF_MODEL_REPO`` is set and ``hf_hub_download`` was
    importable, the model file is fetched from the Hub first; if that download
    raises, the failure is printed and ``MODEL_LOCAL_PATH`` is used instead.

    Returns:
        The cached ``xgb.Booster`` instance.
    """
    global _model

    if _model is None:
        model_path = MODEL_LOCAL_PATH
        hub_available = bool(HF_MODEL_REPO) and bool(hf_hub_download)

        if hub_available:
            try:
                model_path = hf_hub_download(
                    repo_id=HF_MODEL_REPO,
                    filename=HF_MODEL_FILENAME,
                )
            except Exception as e:
                # Best effort: a failed download must not prevent using a local model.
                print(f"Hub download failed: {e}. Falling back to local path: {MODEL_LOCAL_PATH}")

        loaded = xgb.Booster()
        loaded.load_model(model_path)
        # Cache only after a successful load so a failure can be retried.
        _model = loaded

    return _model

# -----------------------------
# Feature engineering stubs
# -----------------------------
# Model feature columns, in the order the model expects them.
# These are example placeholders: replace with your production feature columns.
NUMERIC_COLUMNS = [
    "price",
    "beds",
    "baths",
    "sqft",
    "year_built",
    "agent_experience_years",
    "agent_transactions_12m",
    "distance_km",
]

# Free-text inputs, only used when text-based features are enabled.
TEXT_INPUTS = ["listing_description", "agent_bio"]

def _text_length(value) -> int:
    """Return the character count of *value*, treating missing values as empty."""
    if isinstance(value, str):
        return len(value)
    # NaN is the only float that is not equal to itself. Blank CSV cells read by
    # pandas arrive as NaN, which is truthy, so `value or ""` would not catch it.
    if value is None or (isinstance(value, float) and value != value):
        return 0
    return len(str(value))


def text_to_features(listing_description: str, agent_bio: str) -> dict:
    """
    Toy text feature extraction; replace with your real implementation.

    Args:
        listing_description: Listing text. ``None`` and NaN count as empty.
        agent_bio: Agent biography text. ``None`` and NaN count as empty.

    Returns:
        A dict with ``"desc_len"`` and ``"bio_len"``, the character counts of
        the two inputs. Other non-string values are measured via ``str()``.

    Note:
        The returned keys only reach the model if they are also listed in
        NUMERIC_COLUMNS; coerce_and_fill keeps only those columns.
    """
    # Example toy features based on length
    return {
        "desc_len": _text_length(listing_description),
        "bio_len": _text_length(agent_bio),
    }

def coerce_and_fill(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """Build the numeric feature frame for the model from *df*.

    The result contains exactly the requested columns, in the requested order.
    Columns missing from *df* are created, every value is coerced to a number,
    and anything that cannot be parsed (or is missing) becomes 0. The input
    frame is never modified.

    Args:
        df: Raw input rows; extra columns are ignored.
        columns: Column names to keep, in order. Defaults to NUMERIC_COLUMNS.

    Returns:
        A new DataFrame with one numeric column per requested name.
    """
    wanted = list(NUMERIC_COLUMNS if columns is None else columns)
    # reindex returns a new frame: it selects and orders the columns and adds any
    # missing ones as NaN, without mutating the caller's frame or assigning into
    # a slice of it.
    aligned = df.reindex(columns=wanted)
    # Column-wise numeric coercion; unparseable values become NaN and are zeroed
    # together with the missing ones.
    return aligned.apply(pd.to_numeric, errors="coerce").fillna(0)

# -----------------------------
# Inference
# -----------------------------
def score_one(
    price,
    beds,
    baths,
    sqft,
    year_built,
    agent_experience_years,
    agent_transactions_12m,
    distance_km,
    threshold,
    listing_description="",
    agent_bio=""
):
    """Score a single listing/agent pair and return the result as a JSON string.

    Args:
        price, beds, baths, sqft, year_built, agent_experience_years,
        agent_transactions_12m, distance_km: Raw feature values. They are
            coerced to numbers by coerce_and_fill, with unparseable or missing
            values becoming 0.
        threshold: Decision threshold; the label is 1 when the score is greater
            than or equal to it. ``None`` falls back to THRESH_DEFAULT.
        listing_description: Listing text, used only when USE_TEXT_PIPELINE is on.
        agent_bio: Agent biography, used only when USE_TEXT_PIPELINE is on.

    Returns:
        A JSON document (indent 2) with the keys ``score``, ``label``,
        ``threshold``, ``latency_ms`` and ``inputs``.
    """
    # perf_counter is monotonic, so the measured latency cannot be skewed (or go
    # negative) when the system clock is adjusted mid-request.
    started = time.perf_counter()
    booster = load_model()

    if threshold is None:
        # A cleared or omitted threshold would otherwise crash the comparison below.
        threshold = THRESH_DEFAULT

    row = {
        "price": price,
        "beds": beds,
        "baths": baths,
        "sqft": sqft,
        "year_built": year_built,
        "agent_experience_years": agent_experience_years,
        "agent_transactions_12m": agent_transactions_12m,
        "distance_km": distance_km,
    }

    if USE_TEXT_PIPELINE:
        row.update(text_to_features(listing_description, agent_bio))

    features = coerce_and_fill(pd.DataFrame([row]))
    dmatrix = xgb.DMatrix(features)

    # Single-row input: take the first (only) prediction.
    proba = float(booster.predict(dmatrix)[0])
    label = int(proba >= threshold)
    latency_ms = int((time.perf_counter() - started) * 1000)

    out = {
        "score": round(proba, 6),
        "label": label,
        "threshold": threshold,
        "latency_ms": latency_ms,
        "inputs": row
    }
    return json.dumps(out, indent=2)

def score_batch(file, threshold):
    """
    Score every row of an uploaded CSV and write the result to a new CSV.

    Accepts a CSV with columns matching your expected schema.
    Extra columns are ignored. Missing are filled with zero.

    Args:
        file: The upload, either a path string or an object exposing the path
            as ``.name``.
        threshold: Decision threshold; ``label`` is 1 when ``score`` is greater
            than or equal to it.

    Returns:
        A ``(path, message)`` tuple. On success ``path`` is a temporary CSV
        holding the original columns plus ``score`` and ``label``. When the
        upload is missing or unreadable, ``path`` is ``None`` and the message
        is "Could not read CSV".
    """
    # NOTE(review): which form Gradio passes depends on its version and the File
    # component's `type` setting. Both a path string and a `.name` object work here.
    csv_path = file if isinstance(file, str) else getattr(file, "name", None)
    if not csv_path:
        return None, "Could not read CSV"

    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # UI boundary: report a friendly status, but keep the cause in the logs.
        print(f"Could not read CSV {csv_path!r}: {e}")
        return None, "Could not read CSV"

    # Load the model only after the input is known to be usable.
    booster = load_model()

    df_in = df.copy()
    # If you need to build numeric features from text columns
    if USE_TEXT_PIPELINE and set(TEXT_INPUTS).issubset(df_in.columns):
        # Blank CSV cells are read as NaN; replace them with "" so that
        # text_to_features always receives text for missing values.
        texts = df_in[TEXT_INPUTS].astype(object)
        texts = texts.where(texts.notna(), "")
        text_feats = texts.apply(
            lambda r: pd.Series(text_to_features(
                r["listing_description"],
                r["agent_bio"]
            )),
            axis=1
        )
        df_in = pd.concat([df_in, text_feats], axis=1)

    X = coerce_and_fill(df_in)
    dmatrix = xgb.DMatrix(X)
    probs = booster.predict(dmatrix)

    out = df.copy()
    out["score"] = probs
    out["label"] = (out["score"] >= threshold).astype(int)

    # Save to a temp file for download. Reserve the name, then close the handle
    # before writing so no descriptor is leaked and the path can be re-opened
    # on platforms that forbid opening an already-open temporary file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        out_path = tmp.name
    out.to_csv(out_path, index=False)
    return out_path, f"Scored {len(out)} rows"

# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="Open Pair Scorer") as demo:
    # Page header shown above both tabs.
    gr.Markdown("# Open Pair Scorer\nSingle prediction and batch scoring in one place.")

    # Layout options shared by the numeric fields and the slider on the "Single" tab.
    _cell = {"scale": 1, "min_width": 160}

    with gr.Tab("Single"):
        # Listing features.
        with gr.Row():
            price = gr.Number(label="price", value=350000, **_cell)
            beds = gr.Number(label="beds", value=3, **_cell)
            baths = gr.Number(label="baths", value=2, **_cell)
            sqft = gr.Number(label="sqft", value=1800, **_cell)
        # Remaining listing feature plus agent features.
        with gr.Row():
            year_built = gr.Number(label="year_built", value=2005, **_cell)
            agent_experience_years = gr.Number(label="agent_experience_years", value=5, **_cell)
            agent_transactions_12m = gr.Number(label="agent_transactions_12m", value=18, **_cell)
            distance_km = gr.Number(label="distance_km", value=4.2, **_cell)
        with gr.Row():
            threshold = gr.Slider(
                label="decision threshold",
                value=THRESH_DEFAULT,
                minimum=0.0,
                maximum=1.0,
                step=0.01,
                **_cell,
            )
        # The text boxes are always created, but the row is only shown when the
        # text pipeline is enabled.
        with gr.Row(visible=USE_TEXT_PIPELINE):
            listing_description = gr.Textbox(
                label="listing_description",
                lines=6,
                placeholder="Paste listing description",
                scale=1,
            )
            agent_bio = gr.Textbox(
                label="agent_bio",
                lines=6,
                placeholder="Paste agent bio",
                scale=1,
            )

        btn = gr.Button("Score")
        output = gr.Code(label="Result JSON", language="json")

        # Inputs are passed positionally to score_one; the two text inputs are
        # appended only when the text pipeline is on.
        _single_inputs = [
            price,
            beds,
            baths,
            sqft,
            year_built,
            agent_experience_years,
            agent_transactions_12m,
            distance_km,
            threshold,
        ]
        if USE_TEXT_PIPELINE:
            _single_inputs.extend([listing_description, agent_bio])
        btn.click(score_one, inputs=_single_inputs, outputs=output)

    with gr.Tab("Batch CSV"):
        gr.Markdown("Upload a CSV. It should contain your model feature columns. Extra columns are fine.")
        file_in = gr.File(label="CSV file", file_types=[".csv"])
        threshold_b = gr.Slider(
            label="decision threshold",
            value=THRESH_DEFAULT,
            minimum=0.0,
            maximum=1.0,
            step=0.01,
        )
        run_b = gr.Button("Score batch")
        file_out = gr.File(label="Download scored CSV")
        status = gr.Markdown()
        # score_batch returns (output file path, status message).
        run_b.click(
            score_batch,
            inputs=[file_in, threshold_b],
            outputs=[file_out, status],
        )

    gr.Markdown("Tip: set HF_MODEL_REPO and HF_MODEL_FILENAME in Space Secrets to pull your model from the Hub.")

if __name__ == "__main__":
    # Enable request queuing (at most 64 waiting requests), then serve on all
    # interfaces at the port given by PORT (default 7860).
    queued_app = demo.queue(max_size=64)
    listen_port = int(os.getenv("PORT", "7860"))
    queued_app.launch(server_name="0.0.0.0", server_port=listen_port)