# Streamlit app: predicts whether a customer will purchase the Wellness
# Tourism Package. The trained model is fetched from the Hugging Face Hub and
# the input form is generated from the columns of the training CSV.

import streamlit as st
import pandas as pd
import numpy as np
import joblib
from huggingface_hub import hf_hub_download
from datasets import load_dataset

# -----------------------------
# CONFIG (edit if needed)
# -----------------------------
HF_MODEL_REPO = "nansri/wellness-purchase-predictor"
MODEL_FILENAME = "best_model.joblib"

# Used only to build dropdown options + default values
HF_DATASET_REPO = "nansri/visit-with-us-wellness"
TRAIN_FILE = "processed/train.csv"

st.set_page_config(page_title="Wellness Package Predictor", layout="centered")


# -----------------------------
# Load model from HF hub
# -----------------------------
@st.cache_resource
def load_model():
    """Download the serialized model from the Hugging Face Hub and load it.

    Returns:
        The object stored in ``MODEL_FILENAME`` inside ``HF_MODEL_REPO``,
        as deserialized by ``joblib.load``.
    """
    model_path = hf_hub_download(
        repo_id=HF_MODEL_REPO,
        filename=MODEL_FILENAME,
        repo_type="model",
    )
    # SECURITY NOTE: joblib.load unpickles the file, which can execute
    # arbitrary code. Only point HF_MODEL_REPO at a repository you trust.
    return joblib.load(model_path)


# -----------------------------
# Load metadata (optional but helpful)
# -----------------------------
@st.cache_data
def load_train_metadata():
    """Derive form metadata (columns, defaults, options) from the training CSV.

    Returns:
        A 6-tuple ``(feature_cols, num_cols, cat_cols, medians, modes,
        categories)``:

        * ``feature_cols`` - every column except the target ``ProdTaken``.
        * ``num_cols`` - feature columns with a numeric dtype.
        * ``cat_cols`` - the remaining feature columns.
        * ``medians`` - ``{numeric column: median as float}``.
        * ``modes`` - ``{categorical column: most frequent value as str}``
          (``""`` when the column has no non-null values).
        * ``categories`` - ``{categorical column: sorted unique values as
          str}``.
    """
    ds = load_dataset(HF_DATASET_REPO, data_files={"train": TRAIN_FILE})
    train_df = ds["train"].to_pandas()

    # feature columns (exclude target)
    feature_cols = [c for c in train_df.columns if c != "ProdTaken"]

    # identify numeric vs categorical
    num_cols = train_df[feature_cols].select_dtypes(include=np.number).columns.tolist()
    numeric_names = set(num_cols)
    cat_cols = [c for c in feature_cols if c not in numeric_names]

    # defaults
    medians = {}
    for c in num_cols:
        median = train_df[c].median()
        # An all-NaN column has a NaN median, which is not a usable default
        # for a number input; use 0.0, the same fallback the form applies
        # when a column has no recorded median.
        medians[c] = 0.0 if pd.isna(median) else float(median)

    modes = {
        c: str(train_df[c].mode(dropna=True).iloc[0]) if train_df[c].notna().any() else ""
        for c in cat_cols
    }

    # categories for dropdown
    categories = {
        c: sorted(str(x) for x in train_df[c].dropna().unique().tolist())
        for c in cat_cols
    }

    return feature_cols, num_cols, cat_cols, medians, modes, categories


def _predict_purchase(estimator, input_df):
    """Return ``(label, probability)`` for the single row in ``input_df``.

    ``probability`` is the second column of ``predict_proba`` when the
    estimator provides it, and the label is then ``probability >= 0.5``.
    Otherwise ``probability`` is ``None`` and the label comes from
    ``predict``.
    """
    predict_proba = getattr(estimator, "predict_proba", None)
    if callable(predict_proba):
        try:
            proba = float(predict_proba(input_df)[:, 1][0])
        except (AttributeError, NotImplementedError):
            # The estimator advertises predict_proba but does not support it;
            # fall through to predict(). Any other exception (e.g. bad input
            # data) is a real error and is deliberately not swallowed here.
            pass
        else:
            return int(proba >= 0.5), proba
    return int(estimator.predict(input_df)[0]), None


model = load_model()
feature_cols, num_cols, cat_cols, medians, modes, categories = load_train_metadata()

# -----------------------------
# UI
# -----------------------------
st.title("Wellness Tourism Package Purchase Predictor")
st.write(
    "Enter customer details to predict likelihood of "
    "purchasing the Wellness Tourism Package."
)

inputs = {}

with st.form("input_form"):
    st.subheader("Customer & Interaction Inputs")

    # Numeric inputs
    st.markdown("**Numeric Features**")
    for col in num_cols:
        # int-like columns can still be float in data; allow float entry safely
        inputs[col] = st.number_input(col, value=float(medians.get(col, 0.0)))

    # Categorical inputs
    st.markdown("**Categorical Features**")
    for col in cat_cols:
        opts = categories.get(col, [])
        if not opts:
            # No known categories for this column: accept free text instead.
            inputs[col] = st.text_input(col, value=modes.get(col, ""))
            continue

        # Pre-select the most frequent value, or the first option when the
        # recorded mode is not among the options.
        default_opt = modes.get(col, opts[0])
        if default_opt not in opts:
            default_opt = opts[0]
        inputs[col] = st.selectbox(col, options=opts, index=opts.index(default_opt))

    submitted = st.form_submit_button("Predict")

if submitted:
    # Create dataframe from inputs (rubric requirement)
    input_df = pd.DataFrame([inputs], columns=feature_cols)

    st.write("### Input DataFrame")
    st.dataframe(input_df)

    # Predict
    pred, proba = _predict_purchase(model, input_df)
    if proba is None:
        st.success(f"Prediction (ProdTaken): {pred}")
    else:
        st.success(f"Prediction (ProdTaken): {pred} | Purchase Probability: {proba:.3f}")