# Listing metadata from the file viewer: file size 7,728 bytes; commits ad9ba86, df1616e.
import os
import json
import time
import tempfile
import pandas as pd
import xgboost as xgb
import gradio as gr
# huggingface_hub is an optional dependency: when it cannot be imported the
# name is bound to None and load_model() uses the local model path instead.
try:
    from huggingface_hub import hf_hub_download
except Exception:
    # Deliberately broad: any import-time failure should degrade to the
    # local-file path rather than stop the app from starting.
    hf_hub_download = None
# -----------------------------
# Config
# -----------------------------
# Path of the model file on local disk (default "model.xgb").
MODEL_LOCAL_PATH = os.environ.get("MODEL_LOCAL_PATH", "model.xgb")
# Optional Hub repository to download the model from,
# e.g. "tlogandesigns/open-pair-model". Empty string means "not configured".
HF_MODEL_REPO = os.environ.get("HF_MODEL_REPO", "")
# File name to fetch from HF_MODEL_REPO.
HF_MODEL_FILENAME = os.environ.get("HF_MODEL_FILENAME", "model.xgb")
# Initial value of the decision-threshold sliders in the UI.
THRESH_DEFAULT = float(os.environ.get("THRESH_DEFAULT", "0.50"))
# If you engineer features from text, set this flag (USE_TEXT_PIPELINE=1)
# and implement text_to_features.
USE_TEXT_PIPELINE = os.environ.get("USE_TEXT_PIPELINE", "0") == "1"
# -----------------------------
# Model loading
# -----------------------------
# Module-level cache so the booster is loaded at most once per process.
_model = None


def load_model():
    """Return the XGBoost booster, loading it on the first call.

    The model file is taken from the Hub when HF_MODEL_REPO is set and
    huggingface_hub is importable; otherwise, or when the download fails,
    MODEL_LOCAL_PATH is used. The loaded booster is cached in the
    module-level ``_model`` and returned on every later call.

    Returns:
        The loaded ``xgb.Booster``.

    Raises:
        Whatever ``xgb.Booster.load_model`` raises when the resolved path
        cannot be loaded.
    """
    global _model
    if _model is not None:
        return _model

    path = MODEL_LOCAL_PATH
    if HF_MODEL_REPO:
        if hf_hub_download is None:
            # A repo was requested but the optional dependency is missing;
            # say so instead of silently ignoring the setting.
            print(
                "huggingface_hub could not be imported. "
                f"Using local path: {MODEL_LOCAL_PATH}"
            )
        else:
            try:
                path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=HF_MODEL_FILENAME)
            except Exception as e:
                # Best effort: a failed download must not prevent serving a
                # model that is available locally.
                print(f"Hub download failed: {e}. Falling back to local path: {MODEL_LOCAL_PATH}")

    booster = xgb.Booster()
    booster.load_model(path)
    _model = booster
    return _model
# -----------------------------
# Feature engineering stubs
# -----------------------------
# Feature columns handed to the model, in this exact order.
# Replace with your production feature columns in the correct order;
# the entries below are example placeholders.
NUMERIC_COLUMNS = [
    "price",
    "beds",
    "baths",
    "sqft",
    "year_built",
    "agent_experience_years",
    "agent_transactions_12m",
    "distance_km",
]
# Free-text input columns, used only when text-based features are enabled.
# These are examples; adjust them to your own text inputs.
TEXT_INPUTS = [
    "listing_description",
    "agent_bio",
]
def _text_length(value) -> int:
    """Return the character count of *value*, treating missing values as empty."""
    if isinstance(value, str):
        return len(value)
    # None, or the float NaN that pandas uses for an empty CSV cell.
    # NaN is truthy, so `value or ""` would pass it straight to len().
    if value is None or (isinstance(value, float) and value != value):
        return 0
    # Any other scalar (e.g. a numeric cell) is measured by its text form.
    return len(str(value))


def text_to_features(listing_description: str, agent_bio: str) -> dict:
    """
    Replace with your real text feature extraction.
    Return a dict where keys match NUMERIC_COLUMNS that depend on text.

    The toy implementation returns the character length of each input as
    ``desc_len`` and ``bio_len``. Missing inputs (``None`` or NaN) count as
    empty text instead of raising.

    NOTE: coerce_and_fill keeps only the columns listed in NUMERIC_COLUMNS,
    so these keys reach the model only once they are added to that list.
    """
    # Example toy features based on length
    return {
        "desc_len": _text_length(listing_description),
        "bio_len": _text_length(agent_bio),
    }
def coerce_and_fill(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """Build the numeric model input frame from *df*.

    Args:
        df: Input rows. It is not modified.
        columns: Ordered list of required column names. Defaults to
            NUMERIC_COLUMNS when omitted.

    Returns:
        A new DataFrame containing exactly *columns*, in that order. Missing
        columns are filled with zeros to avoid crashes, and values that
        cannot be parsed as numbers become 0.
    """
    if columns is None:
        columns = NUMERIC_COLUMNS

    # Work on a copy of the relevant columns so the caller's frame is never
    # mutated and no assignment lands on a slice of it.
    present = [col for col in columns if col in df.columns]
    result = df[present].copy()

    for col in columns:
        if col in result.columns:
            # Numeric coercion: unparseable values become NaN, then 0.
            result[col] = pd.to_numeric(result[col], errors="coerce").fillna(0)
        else:
            result[col] = 0

    # Keep only the expected columns, in the expected order.
    return result[list(columns)]
# -----------------------------
# Inference
# -----------------------------
def score_one(
    price,
    beds,
    baths,
    sqft,
    year_built,
    agent_experience_years,
    agent_transactions_12m,
    distance_km,
    threshold,
    listing_description="",
    agent_bio="",
):
    """Score a single row and return the result as a JSON string.

    Args:
        price, beds, baths, sqft, year_built, agent_experience_years,
        agent_transactions_12m, distance_km: Raw feature values from the UI.
        threshold: Decision threshold; the label is 1 when score >= threshold.
        listing_description, agent_bio: Text inputs, used only when
            USE_TEXT_PIPELINE is enabled.

    Returns:
        A pretty-printed JSON string with the keys ``score``, ``label``,
        ``threshold``, ``latency_ms`` and ``inputs``.
    """
    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    booster = load_model()

    row = {
        "price": price,
        "beds": beds,
        "baths": baths,
        "sqft": sqft,
        "year_built": year_built,
        "agent_experience_years": agent_experience_years,
        "agent_transactions_12m": agent_transactions_12m,
        "distance_km": distance_km,
    }
    if USE_TEXT_PIPELINE:
        row.update(text_to_features(listing_description, agent_bio))

    df = coerce_and_fill(pd.DataFrame([row]))
    dmatrix = xgb.DMatrix(df)
    proba = float(booster.predict(dmatrix)[0])
    label = int(proba >= threshold)
    latency_ms = int((time.perf_counter() - t0) * 1000)

    out = {
        "score": round(proba, 6),
        "label": label,
        "threshold": threshold,
        "latency_ms": latency_ms,
        "inputs": row,
    }
    return json.dumps(out, indent=2)
def _uploaded_path(file):
    """Return the filesystem path of an uploaded file, or None if there is none."""
    if file is None:
        return None
    # The upload may arrive as a plain path or as a file-like object with .name.
    if isinstance(file, (str, os.PathLike)):
        return file
    return getattr(file, "name", None)


def score_batch(file, threshold):
    """
    Accepts a CSV with columns matching your expected schema.
    Extra columns are ignored. Missing are filled with zero.

    Args:
        file: The uploaded CSV, either a path or an object with a ``name``
            attribute holding the path. ``None`` means nothing was uploaded.
        threshold: Decision threshold; the label is 1 when score >= threshold.

    Returns:
        A ``(path, status)`` tuple. ``path`` is a temporary CSV holding the
        input rows plus ``score`` and ``label`` columns, or ``None`` when the
        input could not be used. ``status`` is a short message for the UI.
    """
    path = _uploaded_path(file)
    if not path:
        return None, "No CSV file provided"
    try:
        df = pd.read_csv(path)
    except Exception:
        return None, "Could not read CSV"

    # Load the model only after the input is known to be readable.
    booster = load_model()

    df_in = df.copy()
    # If you need to build numeric features from text columns
    if USE_TEXT_PIPELINE and set(TEXT_INPUTS).issubset(df_in.columns):
        text_feats = df_in.apply(
            lambda r: pd.Series(text_to_features(
                r.get("listing_description", ""),
                r.get("agent_bio", "")
            )),
            axis=1
        )
        df_in = pd.concat([df_in, text_feats], axis=1)

    X = coerce_and_fill(df_in)
    probs = booster.predict(xgb.DMatrix(X))

    out = df.copy()
    out["score"] = probs
    out["label"] = (out["score"] >= threshold).astype(int)

    # Save to a temp file for download. Only the name is reserved here; the
    # handle is closed before pandas writes, so no descriptor is leaked.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        out_path = tmp.name
    out.to_csv(out_path, index=False)
    return out_path, f"Scored {len(out)} rows"
# -----------------------------
# UI
# -----------------------------
# NOTE(review): the original nesting was lost; the layout below is a
# reconstruction. The closing tip is assumed to sit below both tabs — confirm.
with gr.Blocks(title="Open Pair Scorer") as demo:
    gr.Markdown("# Open Pair Scorer\nSingle prediction and batch scoring in one place.")

    with gr.Tab("Single"):
        with gr.Row():
            price = gr.Number(label="price", value=350000, scale=1, min_width=160)
            beds = gr.Number(label="beds", value=3, scale=1, min_width=160)
            baths = gr.Number(label="baths", value=2, scale=1, min_width=160)
            sqft = gr.Number(label="sqft", value=1800, scale=1, min_width=160)
        with gr.Row():
            year_built = gr.Number(label="year_built", value=2005, scale=1, min_width=160)
            agent_experience_years = gr.Number(label="agent_experience_years", value=5, scale=1, min_width=160)
            agent_transactions_12m = gr.Number(label="agent_transactions_12m", value=18, scale=1, min_width=160)
            distance_km = gr.Number(label="distance_km", value=4.2, scale=1, min_width=160)
        with gr.Row():
            threshold = gr.Slider(label="decision threshold", value=THRESH_DEFAULT, minimum=0.0, maximum=1.0, step=0.01, scale=1, min_width=160)
        # Text inputs are always created but shown only when the text pipeline is on.
        with gr.Row(visible=USE_TEXT_PIPELINE):
            listing_description = gr.Textbox(label="listing_description", lines=6, placeholder="Paste listing description", scale=1)
            agent_bio = gr.Textbox(label="agent_bio", lines=6, placeholder="Paste agent bio", scale=1)

        btn = gr.Button("Score")
        output = gr.Code(label="Result JSON", language="json")

        # One click handler; the text inputs are passed only when the text
        # pipeline is enabled, otherwise score_one uses its "" defaults.
        single_inputs = [
            price, beds, baths, sqft, year_built,
            agent_experience_years, agent_transactions_12m, distance_km,
            threshold,
        ]
        if USE_TEXT_PIPELINE:
            single_inputs += [listing_description, agent_bio]
        btn.click(score_one, inputs=single_inputs, outputs=output)

    with gr.Tab("Batch CSV"):
        gr.Markdown("Upload a CSV. It should contain your model feature columns. Extra columns are fine.")
        file_in = gr.File(label="CSV file", file_types=[".csv"])
        threshold_b = gr.Slider(label="decision threshold", value=THRESH_DEFAULT, minimum=0.0, maximum=1.0, step=0.01)
        run_b = gr.Button("Score batch")
        file_out = gr.File(label="Download scored CSV")
        status = gr.Markdown()
        run_b.click(score_batch, inputs=[file_in, threshold_b], outputs=[file_out, status])

    gr.Markdown("Tip: set HF_MODEL_REPO and HF_MODEL_FILENAME in Space Secrets to pull your model from the Hub.")
if __name__ == "__main__":
    # Listen on all interfaces; the PORT environment variable overrides the
    # default port 7860. At most 64 requests are queued.
    port = int(os.getenv("PORT", "7860"))
    demo.queue(max_size=64).launch(server_name="0.0.0.0", server_port=port)