# tlogandesigns's picture
# ui updates
# df1616e
import os
import json
import time
import tempfile
import pandas as pd
import xgboost as xgb
import gradio as gr
# huggingface_hub is optional: if the import fails for any reason, hf_hub_download
# is set to None and load_model() skips the Hub download.
try:
    from huggingface_hub import hf_hub_download
except Exception:
    hf_hub_download = None
# -----------------------------
# Config (every value is read from an environment variable at import time)
# -----------------------------
# Local model file; load_model() uses it when no Hub repo is configured or the Hub download fails.
MODEL_LOCAL_PATH = os.getenv("MODEL_LOCAL_PATH", "model.xgb")
# Hub repo to download the model from; the empty default disables the download.
HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "") # e.g. "tlogandesigns/open-pair-model"
# File name requested from HF_MODEL_REPO.
HF_MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", "model.xgb")
# Initial value of the decision-threshold sliders in the UI.
THRESH_DEFAULT = float(os.getenv("THRESH_DEFAULT", "0.50"))
# If you engineer features from text, set a flag and implement text_to_features
# (the flag is on only when the variable is exactly "1").
USE_TEXT_PIPELINE = os.getenv("USE_TEXT_PIPELINE", "0") == "1"
# -----------------------------
# Model loading
# -----------------------------
# Process-wide cache for the loaded booster; filled on the first load_model() call.
_model = None


def load_model():
    """Return the XGBoost booster, loading and caching it on first use.

    When HF_MODEL_REPO is set and huggingface_hub was importable, the model
    file HF_MODEL_FILENAME is downloaded from that repo. If the download
    raises, a message is printed and MODEL_LOCAL_PATH is used instead.
    Errors raised while loading the model file itself are not caught.
    """
    global _model

    if _model is None:
        # Start from the local file and only override it on a successful download.
        model_path = MODEL_LOCAL_PATH
        if hf_hub_download and HF_MODEL_REPO:
            try:
                model_path = hf_hub_download(
                    repo_id=HF_MODEL_REPO,
                    filename=HF_MODEL_FILENAME,
                )
            except Exception as e:
                print(f"Hub download failed: {e}. Falling back to local path: {MODEL_LOCAL_PATH}")

        loaded = xgb.Booster()
        loaded.load_model(model_path)
        # Cache only after a successful load so a failed attempt is retried next call.
        _model = loaded

    return _model
# -----------------------------
# Feature engineering stubs
# -----------------------------
# Columns handed to the model, in this exact order. coerce_and_fill() keeps only
# these columns, so any feature not listed here (including the "desc_len" and
# "bio_len" keys returned by the text_to_features stub) is dropped before scoring.
NUMERIC_COLUMNS = [
    # Replace with your production feature columns in the correct order
    # Example placeholders:
    "price", "beds", "baths", "sqft", "year_built",
    "agent_experience_years", "agent_transactions_12m", "distance_km"
]
# Raw text columns; score_batch() builds text features only when a CSV contains all of them.
TEXT_INPUTS = [
    # Examples if you do text based features
    "listing_description", "agent_bio"
]
def _text_length(value) -> int:
    """Return the character count of *value*, counting missing values as 0."""
    if isinstance(value, str):
        return len(value)
    if value is None:
        return 0
    # NaN is the only float that differs from itself; pandas uses it for empty
    # CSV cells, and it is truthy, so `value or ""` would not catch it.
    if isinstance(value, float) and value != value:
        return 0
    # Any other non-string value (e.g. a cell parsed as a number) is measured
    # through its string form instead of raising TypeError.
    return len(str(value))


def text_to_features(listing_description: str, agent_bio: str) -> dict:
    """
    Replace with your real text feature extraction.
    Return a dict where keys match NUMERIC_COLUMNS that depend on text.

    The placeholder implementation returns the length of each text under the
    keys "desc_len" and "bio_len". None and NaN inputs count as empty text.
    """
    # Example toy features based on length
    return {
        "desc_len": _text_length(listing_description),
        "bio_len": _text_length(agent_bio),
    }
def coerce_and_fill(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """Return a new all-numeric frame holding exactly the expected columns.

    Args:
        df: Input frame. It is not modified.
        columns: Column names to keep, in output order. Defaults to
            NUMERIC_COLUMNS when omitted.

    Returns:
        A frame indexed like ``df``. Columns missing from ``df`` are filled
        with 0; values that cannot be parsed as numbers, and missing values,
        become 0. Columns of ``df`` that are not requested are dropped.
    """
    wanted = list(NUMERIC_COLUMNS if columns is None else columns)

    data = {}
    for col in wanted:
        if col in df.columns:
            # errors="coerce" turns unparseable values into NaN, which is then zeroed.
            data[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
        else:
            # Fill missing columns with zeros to avoid crashes downstream.
            data[col] = 0

    # Building a fresh frame avoids writing into a slice of the caller's data.
    return pd.DataFrame(data, index=df.index, columns=wanted)
# -----------------------------
# Inference
# -----------------------------
def score_one(
    price,
    beds,
    baths,
    sqft,
    year_built,
    agent_experience_years,
    agent_transactions_12m,
    distance_km,
    threshold,
    listing_description="",
    agent_bio="",
):
    """Score one set of feature values and return the result as a JSON string.

    Args:
        price, beds, baths, sqft, year_built, agent_experience_years,
        agent_transactions_12m, distance_km: Raw feature values. They are
            coerced to numbers by coerce_and_fill(), which maps anything
            unparseable to 0.
        threshold: Cut-off compared against the model output; the label is 1
            when the output is greater than or equal to it, else 0.
        listing_description, agent_bio: Text inputs. Only used when
            USE_TEXT_PIPELINE is enabled.

    Returns:
        A JSON document (indent 2) with the keys "score", "label",
        "threshold", "latency_ms" and "inputs".
    """
    # perf_counter is monotonic, so the measured latency cannot go negative or
    # jump if the system clock is adjusted during the call.
    started = time.perf_counter()
    booster = load_model()

    row = {
        "price": price,
        "beds": beds,
        "baths": baths,
        "sqft": sqft,
        "year_built": year_built,
        "agent_experience_years": agent_experience_years,
        "agent_transactions_12m": agent_transactions_12m,
        "distance_km": distance_km,
    }
    if USE_TEXT_PIPELINE:
        row.update(text_to_features(listing_description, agent_bio))

    features = coerce_and_fill(pd.DataFrame([row]))
    proba = float(booster.predict(xgb.DMatrix(features))[0])
    # The label uses the unrounded score; rounding is applied only for display.
    label = int(proba >= threshold)
    latency_ms = int((time.perf_counter() - started) * 1000)

    out = {
        "score": round(proba, 6),
        "label": label,
        "threshold": threshold,
        "latency_ms": latency_ms,
        "inputs": row,
    }
    return json.dumps(out, indent=2)
def score_batch(file, threshold):
    """
    Accepts a CSV with columns matching your expected schema.
    Extra columns are ignored. Missing are filled with zero.

    Args:
        file: Uploaded CSV, given either as an object exposing the path in
            ``.name`` or as a plain path string.
        threshold: Cut-off compared against each score; the label is 1 when
            the score is greater than or equal to it, else 0.

    Returns:
        A ``(path, message)`` tuple. On success ``path`` is a temporary CSV
        holding the input rows plus "score" and "label" columns. When the
        CSV cannot be read, ``path`` is None and the message says so.
    """
    booster = load_model()
    try:
        # Fall back to the value itself when there is no `.name` attribute.
        # None (nothing uploaded) makes read_csv raise and lands in the handler.
        csv_path = getattr(file, "name", file)
        df = pd.read_csv(csv_path)
    except Exception:
        return None, "Could not read CSV"

    df_in = df.copy()
    # If you need to build numeric features from text columns
    if USE_TEXT_PIPELINE and set(TEXT_INPUTS).issubset(df_in.columns):
        # One pass over the two text columns instead of a row-wise apply that
        # allocates a Series per row.
        text_feats = pd.DataFrame(
            [
                text_to_features(desc, bio)
                for desc, bio in zip(df_in["listing_description"], df_in["agent_bio"])
            ],
            index=df_in.index,
        )
        df_in = pd.concat([df_in, text_feats], axis=1)

    X = coerce_and_fill(df_in)
    probs = booster.predict(xgb.DMatrix(X))

    out = df.copy()
    out["score"] = probs
    out["label"] = (out["score"] >= threshold).astype(int)

    # Save to a temp file for download. The handle is closed before writing so
    # no descriptor is leaked; delete=False keeps the file on disk afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        out_path = tmp.name
    out.to_csv(out_path, index=False)
    return out_path, f"Scored {len(out)} rows"
# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="Open Pair Scorer") as demo:
    gr.Markdown("# Open Pair Scorer\nSingle prediction and batch scoring in one place.")

    # ---- Tab 1: score a single record entered by hand ----
    with gr.Tab("Single"):
        with gr.Row():
            price = gr.Number(label="price", value=350000, scale=1, min_width=160)
            beds = gr.Number(label="beds", value=3, scale=1, min_width=160)
            baths = gr.Number(label="baths", value=2, scale=1, min_width=160)
            sqft = gr.Number(label="sqft", value=1800, scale=1, min_width=160)

        with gr.Row():
            year_built = gr.Number(label="year_built", value=2005, scale=1, min_width=160)
            agent_experience_years = gr.Number(
                label="agent_experience_years", value=5, scale=1, min_width=160
            )
            agent_transactions_12m = gr.Number(
                label="agent_transactions_12m", value=18, scale=1, min_width=160
            )
            distance_km = gr.Number(label="distance_km", value=4.2, scale=1, min_width=160)

        with gr.Row():
            threshold = gr.Slider(
                label="decision threshold",
                value=THRESH_DEFAULT,
                minimum=0.0,
                maximum=1.0,
                step=0.01,
                scale=1,
                min_width=160,
            )

        # The text inputs are always created but hidden unless the text pipeline is on.
        with gr.Row(visible=USE_TEXT_PIPELINE):
            listing_description = gr.Textbox(
                label="listing_description",
                lines=6,
                placeholder="Paste listing description",
                scale=1,
            )
            agent_bio = gr.Textbox(
                label="agent_bio", lines=6, placeholder="Paste agent bio", scale=1
            )

        btn = gr.Button("Score")
        output = gr.Code(label="Result JSON", language="json")

        # The text boxes are wired in only when the text pipeline is enabled;
        # otherwise score_one falls back to its empty-string defaults.
        btn.click(
            score_one,
            inputs=[
                price,
                beds,
                baths,
                sqft,
                year_built,
                agent_experience_years,
                agent_transactions_12m,
                distance_km,
                threshold,
            ]
            + ([listing_description, agent_bio] if USE_TEXT_PIPELINE else []),
            outputs=output,
        )

    # ---- Tab 2: score every row of an uploaded CSV ----
    with gr.Tab("Batch CSV"):
        gr.Markdown("Upload a CSV. It should contain your model feature columns. Extra columns are fine.")
        file_in = gr.File(label="CSV file", file_types=[".csv"])
        threshold_b = gr.Slider(
            label="decision threshold",
            value=THRESH_DEFAULT,
            minimum=0.0,
            maximum=1.0,
            step=0.01,
        )
        run_b = gr.Button("Score batch")
        file_out = gr.File(label="Download scored CSV")
        status = gr.Markdown()
        run_b.click(score_batch, inputs=[file_in, threshold_b], outputs=[file_out, status])

    # NOTE(review): placed below both tabs as a page footer — confirm the intended nesting.
    gr.Markdown("Tip: set HF_MODEL_REPO and HF_MODEL_FILENAME in Space Secrets to pull your model from the Hub.")
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT and defaults to 7860.
    # The request queue holds at most 64 pending jobs.
    queued_app = demo.queue(max_size=64)
    queued_app.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
    )