# Hugging Face Spaces page residue (captured Space status: "Sleeping") — not part of the app code.
import json
import os
import tempfile
import time

import gradio as gr
import pandas as pd
import xgboost as xgb

try:
    from huggingface_hub import hf_hub_download
except Exception:
    # huggingface_hub is optional; without it we fall back to the local model file.
    hf_hub_download = None
# -----------------------------
# Config
# -----------------------------
# Local fallback path for the serialized XGBoost model.
MODEL_LOCAL_PATH = os.getenv("MODEL_LOCAL_PATH", "model.xgb")
# When set, the model is pulled from this Hub repo instead of the local path.
HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "")  # e.g. "tlogandesigns/open-pair-model"
HF_MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", "model.xgb")
# Default decision threshold shown in the UI sliders.
THRESH_DEFAULT = float(os.getenv("THRESH_DEFAULT", "0.50"))

# If you engineer features from text, set a flag and implement text_to_features
USE_TEXT_PIPELINE = os.getenv("USE_TEXT_PIPELINE", "0") == "1"
# -----------------------------
# Model loading
# -----------------------------
_model = None  # process-wide cached Booster; populated lazily by load_model()


def load_model():
    """Return the cached XGBoost Booster, loading it on first call.

    Resolution order: if HF_MODEL_REPO is configured and huggingface_hub is
    importable, the model file is downloaded from the Hub; on any download
    failure (or when the Hub path is unconfigured) the local file at
    MODEL_LOCAL_PATH is used instead.
    """
    global _model
    if _model is None:
        model_path = MODEL_LOCAL_PATH
        if HF_MODEL_REPO and hf_hub_download:
            try:
                model_path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=HF_MODEL_FILENAME)
            except Exception as e:
                # Best-effort download: log and fall back to the local file.
                print(f"Hub download failed: {e}. Falling back to local path: {MODEL_LOCAL_PATH}")
        booster = xgb.Booster()
        booster.load_model(model_path)
        _model = booster
    return _model
# -----------------------------
# Feature engineering stubs
# -----------------------------

# Replace with your production feature columns, in the exact order the model
# was trained on. These placeholders mirror the single-prediction form below.
NUMERIC_COLUMNS = [
    "price",
    "beds",
    "baths",
    "sqft",
    "year_built",
    "agent_experience_years",
    "agent_transactions_12m",
    "distance_km",
]

# Free-text inputs used only when USE_TEXT_PIPELINE is enabled.
TEXT_INPUTS = [
    "listing_description",
    "agent_bio",
]
def text_to_features(listing_description: str, agent_bio: str) -> dict:
    """
    Replace with your real text feature extraction.
    Return a dict where keys match NUMERIC_COLUMNS that depend on text.
    """
    # Toy stand-in features: character lengths of the two text fields.
    # NOTE(review): "desc_len"/"bio_len" are not present in NUMERIC_COLUMNS,
    # so coerce_and_fill currently drops them — confirm against the real
    # production feature list.
    desc = listing_description or ""
    bio = agent_bio or ""
    return {"desc_len": len(desc), "bio_len": len(bio)}
def coerce_and_fill(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """Return a new frame restricted to the model's feature columns, numeric-coerced.

    Missing columns are created and filled with 0, extra columns are dropped,
    and non-numeric values are coerced to 0, so downstream DMatrix construction
    never crashes on a partial upload.

    Args:
        df: Input frame; it is NOT modified (the previous implementation added
            missing columns to the caller's frame in place).
        columns: Optional explicit column list/order; defaults to
            NUMERIC_COLUMNS.

    Returns:
        A DataFrame with exactly `columns` in order, all values numeric.
    """
    cols = list(NUMERIC_COLUMNS) if columns is None else list(columns)
    # reindex builds a fresh frame: keeps matching columns, creates missing
    # ones filled with 0, and drops extras — without touching the input.
    out = df.reindex(columns=cols, fill_value=0)
    for col in cols:
        out[col] = pd.to_numeric(out[col], errors="coerce").fillna(0)
    return out
# -----------------------------
# Inference
# -----------------------------
def score_one(
    price,
    beds,
    baths,
    sqft,
    year_built,
    agent_experience_years,
    agent_transactions_12m,
    distance_km,
    threshold,
    listing_description="",
    agent_bio="",
):
    """Score a single listing/agent pair.

    Builds a one-row feature frame from the form inputs (optionally augmented
    with text-derived features), runs the booster, and returns a pretty-printed
    JSON string with the score, thresholded label, latency, and echoed inputs.
    """
    start = time.time()
    booster = load_model()

    row = {
        "price": price,
        "beds": beds,
        "baths": baths,
        "sqft": sqft,
        "year_built": year_built,
        "agent_experience_years": agent_experience_years,
        "agent_transactions_12m": agent_transactions_12m,
        "distance_km": distance_km,
    }
    if USE_TEXT_PIPELINE:
        row.update(text_to_features(listing_description, agent_bio))

    features = coerce_and_fill(pd.DataFrame([row]))
    proba = float(booster.predict(xgb.DMatrix(features))[0])

    result = {
        "score": round(proba, 6),
        "label": int(proba >= threshold),
        "threshold": threshold,
        "latency_ms": int((time.time() - start) * 1000),
        "inputs": row,
    }
    return json.dumps(result, indent=2)
def score_batch(file, threshold):
    """
    Score every row of an uploaded CSV.

    Accepts a CSV with columns matching your expected schema.
    Extra columns are ignored. Missing are filled with zero.

    Args:
        file: Gradio file object (has a `.name` path) or None when nothing
            was uploaded.
        threshold: Decision threshold applied to the predicted scores.

    Returns:
        (path_to_scored_csv, status_message); the path is None on failure.
    """
    # Guard: the UI allows clicking "Score batch" with no file selected;
    # the original code crashed with AttributeError on file.name.
    if file is None:
        return None, "No file uploaded"
    booster = load_model()
    try:
        df = pd.read_csv(file.name)
    except Exception:
        return None, "Could not read CSV"
    df_in = df.copy()
    if USE_TEXT_PIPELINE and set(TEXT_INPUTS).issubset(df_in.columns):
        # Build numeric features from the text columns, one row at a time.
        text_feats = df_in.apply(
            lambda r: pd.Series(text_to_features(
                r.get("listing_description", ""),
                r.get("agent_bio", "")
            )),
            axis=1
        )
        df_in = pd.concat([df_in, text_feats], axis=1)
    X = coerce_and_fill(df_in)
    probs = booster.predict(xgb.DMatrix(X))
    out = df.copy()
    out["score"] = probs
    out["label"] = (out["score"] >= threshold).astype(int)
    # Save to a temp file for download. Close the handle before writing via
    # the path: on Windows an open NamedTemporaryFile cannot be reopened, and
    # this also avoids leaking the descriptor.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    tmp.close()
    out.to_csv(tmp.name, index=False)
    return tmp.name, f"Scored {len(out)} rows"
# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="Open Pair Scorer") as demo:
    gr.Markdown("# Open Pair Scorer\nSingle prediction and batch scoring in one place.")

    with gr.Tab("Single"):
        # Numeric feature inputs, laid out two rows of four.
        with gr.Row():
            price = gr.Number(label="price", value=350000, scale=1, min_width=160)
            beds = gr.Number(label="beds", value=3, scale=1, min_width=160)
            baths = gr.Number(label="baths", value=2, scale=1, min_width=160)
            sqft = gr.Number(label="sqft", value=1800, scale=1, min_width=160)
        with gr.Row():
            year_built = gr.Number(label="year_built", value=2005, scale=1, min_width=160)
            agent_experience_years = gr.Number(label="agent_experience_years", value=5, scale=1, min_width=160)
            agent_transactions_12m = gr.Number(label="agent_transactions_12m", value=18, scale=1, min_width=160)
            distance_km = gr.Number(label="distance_km", value=4.2, scale=1, min_width=160)
        with gr.Row():
            threshold = gr.Slider(label="decision threshold", value=THRESH_DEFAULT, minimum=0.0, maximum=1.0, step=0.01, scale=1, min_width=160)
        # Text inputs exist either way; the row is only shown in text mode.
        with gr.Row(visible=USE_TEXT_PIPELINE):
            listing_description = gr.Textbox(label="listing_description", lines=6, placeholder="Paste listing description", scale=1)
            agent_bio = gr.Textbox(label="agent_bio", lines=6, placeholder="Paste agent bio", scale=1)
        btn = gr.Button("Score")
        output = gr.Code(label="Result JSON", language="json")

        # Assemble the click inputs once; append the text boxes only when the
        # text pipeline is enabled (matches score_one's optional parameters).
        single_inputs = [
            price, beds, baths, sqft, year_built,
            agent_experience_years, agent_transactions_12m, distance_km,
            threshold,
        ]
        if USE_TEXT_PIPELINE:
            single_inputs += [listing_description, agent_bio]
        btn.click(score_one, inputs=single_inputs, outputs=output)

    with gr.Tab("Batch CSV"):
        gr.Markdown("Upload a CSV. It should contain your model feature columns. Extra columns are fine.")
        file_in = gr.File(label="CSV file", file_types=[".csv"])
        threshold_b = gr.Slider(label="decision threshold", value=THRESH_DEFAULT, minimum=0.0, maximum=1.0, step=0.01)
        run_b = gr.Button("Score batch")
        file_out = gr.File(label="Download scored CSV")
        status = gr.Markdown()
        run_b.click(score_batch, inputs=[file_in, threshold_b], outputs=[file_out, status])

    gr.Markdown("Tip: set HF_MODEL_REPO and HF_MODEL_FILENAME in Space Secrets to pull your model from the Hub.")
if __name__ == "__main__":
    # Queue limits concurrent requests; PORT (Space-provided) overrides 7860.
    port = int(os.getenv("PORT", "7860"))
    demo.queue(max_size=64).launch(server_name="0.0.0.0", server_port=port)