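# NOTE(review): the next three lines appear to be page residue from the
# Hugging Face file viewer; kept as comments so the module parses as Python.
# nansri's picture
# Upload folder using huggingface_hub
# 1038117 verified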
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from huggingface_hub import hf_hub_download
from datasets import load_dataset
# -----------------------------
# CONFIG (edit if needed)
# -----------------------------
# Hub repository that hosts the trained model, and the file to fetch from it.
HF_MODEL_REPO = "nansri/wellness-purchase-predictor"
MODEL_FILENAME = "best_model.joblib"

# Training data location; read only to derive dropdown options and defaults.
HF_DATASET_REPO = "nansri/visit-with-us-wellness"
TRAIN_FILE = "processed/train.csv"

st.set_page_config(
    layout="centered",
    page_title="Wellness Package Predictor",
)
# -----------------------------
# Load model from HF hub
# -----------------------------
@st.cache_resource
def load_model():
    """Fetch the serialized model file from the Hugging Face Hub and load it.

    The file named by ``MODEL_FILENAME`` is downloaded from the model
    repository ``HF_MODEL_REPO`` and deserialized with ``joblib``.

    Returns:
        The object stored in the joblib file.
    """
    local_file = hf_hub_download(
        repo_type="model",
        repo_id=HF_MODEL_REPO,
        filename=MODEL_FILENAME,
    )
    # NOTE: joblib.load relies on pickle, which can execute arbitrary code
    # while loading; only point HF_MODEL_REPO at a repository you trust.
    estimator = joblib.load(local_file)
    return estimator
# -----------------------------
# Load metadata (optional but helpful)
# -----------------------------
@st.cache_data
def load_train_metadata():
    """Derive form metadata (columns, defaults, choices) from the training CSV.

    Loads ``TRAIN_FILE`` from the dataset repository ``HF_DATASET_REPO`` and
    summarizes every column except the target ``"ProdTaken"``.

    Returns:
        A 6-tuple ``(feature_cols, num_cols, cat_cols, medians, modes,
        categories)`` where

        * ``feature_cols`` lists all non-target columns in file order,
        * ``num_cols`` / ``cat_cols`` split them into numeric-dtype columns
          and everything else,
        * ``medians`` maps each numeric column to its median as a ``float``,
        * ``modes`` maps each categorical column to its most frequent value
          as a ``str`` (``""`` when the column has no non-null values),
        * ``categories`` maps each categorical column to the sorted list of
          its distinct non-null values, rendered as strings.
    """
    dataset = load_dataset(HF_DATASET_REPO, data_files={"train": TRAIN_FILE})
    frame = dataset["train"].to_pandas()

    # Everything except the target is a model input.
    feature_cols = [name for name in frame.columns if name != "ProdTaken"]

    # Numeric dtypes get number inputs; all remaining columns are categorical.
    numeric_view = frame[feature_cols].select_dtypes(include=np.number)
    num_cols = numeric_view.columns.tolist()
    cat_cols = [name for name in feature_cols if name not in num_cols]

    # Default value for each numeric widget.
    medians = {}
    for name in num_cols:
        medians[name] = float(frame[name].median())

    # Default selection and available choices for each categorical widget.
    modes = {}
    categories = {}
    for name in cat_cols:
        series = frame[name]
        if series.notna().any():
            modes[name] = str(series.mode(dropna=True).iloc[0])
        else:
            modes[name] = ""
        distinct_values = series.dropna().unique().tolist()
        categories[name] = sorted(str(value) for value in distinct_values)

    return feature_cols, num_cols, cat_cols, medians, modes, categories
# -----------------------------
# Load artifacts
# -----------------------------
# Both loaders are decorated with Streamlit caches (st.cache_resource /
# st.cache_data), so they are intended to be reused across script reruns.
model = load_model()
feature_cols, num_cols, cat_cols, medians, modes, categories = load_train_metadata()

# -----------------------------
# UI
# -----------------------------
st.title("Wellness Tourism Package Purchase Predictor")
st.write("Enter customer details to predict likelihood of purchasing the Wellness Tourism Package.")

# Widget values collected from the form, keyed by feature column name.
inputs = {}

with st.form("input_form"):
    st.subheader("Customer & Interaction Inputs")

    # Numeric inputs, pre-filled with the training median of each column.
    st.markdown("**Numeric Features**")
    for col in num_cols:
        default_val = medians.get(col, 0.0)
        # int-like columns can still be float in data; allow float entry safely
        inputs[col] = st.number_input(col, value=float(default_val))

    # Categorical inputs, pre-selected to the training mode of each column.
    st.markdown("**Categorical Features**")
    for col in cat_cols:
        opts = categories.get(col, [])
        if opts:
            default_opt = modes.get(col, opts[0])
            # Guard against a stored mode that is not among the offered options.
            if default_opt not in opts:
                default_opt = opts[0]
            inputs[col] = st.selectbox(col, options=opts, index=opts.index(default_opt))
        else:
            # No known categories for this column: fall back to free text.
            inputs[col] = st.text_input(col, value=modes.get(col, ""))

    submitted = st.form_submit_button("Predict")

if submitted:
    # Single-row frame with columns in the same order as the training features.
    input_df = pd.DataFrame([inputs], columns=feature_cols)
    st.write("### Input DataFrame")
    st.dataframe(input_df)

    # Predict. Choose the code path by capability instead of catching every
    # exception: a blanket ``except Exception`` would hide real failures
    # (bad input schema, formatting errors) behind a silent second attempt.
    if hasattr(model, "predict_proba"):
        # Column 1 is taken as the probability of the positive class
        # (assumes a binary classifier — TODO confirm against the model).
        proba = model.predict_proba(input_df)[:, 1][0]
        pred = int(proba >= 0.5)
        st.success(f"Prediction (ProdTaken): {pred} | Purchase Probability: {proba:.3f}")
    else:
        # Model exposes no probability estimates; report the hard label only.
        pred = int(model.predict(input_df)[0])
        st.success(f"Prediction (ProdTaken): {pred}")