VJBharathkumar's picture
Upload 2 files
395fcc8 verified
import json
import joblib
import numpy as np
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download
# -------------------------
# CONFIG (EDIT IF NEEDED)
# -------------------------
HF_MODEL_REPO = "VJBharathkumar/tourism-prod-prediction" # <-- your model repo on HF
HF_DATASET_REPO = "VJBharathkumar/tourism-wellness" # <-- your dataset repo on HF
MODEL_FILENAME = "model.joblib"
METRICS_FILENAME = "metrics.json"
TRAIN_FILENAME_IN_DATASET = "train.csv" # uploaded in Step 5
TARGET = "ProdTaken"
# These are the expected feature columns (18) from your dataset
FEATURE_COLS = [
"Age",
"TypeofContact",
"CityTier",
"DurationOfPitch",
"Occupation",
"Gender",
"NumberOfPersonVisiting",
"NumberOfFollowups",
"ProductPitched",
"PreferredPropertyStar",
"MaritalStatus",
"NumberOfTrips",
"Passport",
"PitchSatisfactionScore",
"OwnCar",
"NumberOfChildrenVisiting",
"Designation",
"MonthlyIncome",
]
@st.cache_resource
def load_model_and_metadata():
model_path = hf_hub_download(
repo_id=HF_MODEL_REPO,
filename=MODEL_FILENAME,
repo_type="model",
)
model = joblib.load(model_path)
metrics = None
try:
metrics_path = hf_hub_download(
repo_id=HF_MODEL_REPO,
filename=METRICS_FILENAME,
repo_type="model",
)
with open(metrics_path, "r", encoding="utf-8") as f:
metrics = json.load(f)
except Exception:
metrics = None
return model, metrics
@st.cache_data
def load_train_for_ui_hints():
"""
Pull train.csv from HF dataset repo to:
- get dropdown options for categorical columns
- get min/max for numeric sliders
"""
train_path = hf_hub_download(
repo_id=HF_DATASET_REPO,
filename=TRAIN_FILENAME_IN_DATASET,
repo_type="dataset",
)
df = pd.read_csv(train_path)
# If ProdTaken exists, drop it for UI feature work
if TARGET in df.columns:
df = df.drop(columns=[TARGET])
# Keep only expected features (protects against accidental extra columns)
df = df[[c for c in FEATURE_COLS if c in df.columns]].copy()
return df
def build_input_form(train_df: pd.DataFrame) -> pd.DataFrame:
st.subheader("Enter customer details")
# Determine categorical vs numeric from training df
cat_cols = train_df.select_dtypes(include=["object"]).columns.tolist()
num_cols = [c for c in train_df.columns if c not in cat_cols]
left, right = st.columns(2)
values = {}
# Helper to draw widget
def draw_widget(col_name, container):
if col_name in cat_cols:
options = sorted([x for x in train_df[col_name].dropna().unique().tolist()])
default = options[0] if options else ""
values[col_name] = container.selectbox(col_name, options=options, index=0)
else:
# numeric
series = pd.to_numeric(train_df[col_name], errors="coerce")
min_v = float(np.nanmin(series.values))
max_v = float(np.nanmax(series.values))
med_v = float(np.nanmedian(series.values))
# If it's basically an integer field, use number_input with step 1
if np.all(np.isclose(series.dropna() % 1, 0)):
values[col_name] = container.number_input(
col_name,
min_value=int(min_v),
max_value=int(max_v),
value=int(round(med_v)),
step=1,
)
else:
values[col_name] = container.number_input(
col_name,
min_value=float(min_v),
max_value=float(max_v),
value=float(med_v),
)
# Alternate columns for nicer layout
for i, col_name in enumerate(FEATURE_COLS):
if col_name not in train_df.columns:
continue
container = left if i % 2 == 0 else right
draw_widget(col_name, container)
input_df = pd.DataFrame([values], columns=[c for c in FEATURE_COLS if c in values])
return input_df
def main():
st.set_page_config(page_title="Tourism Package Prediction", layout="wide")
st.title("Tourism Package Prediction")
st.write("Predict whether the customer will take the package (`ProdTaken = 1`).")
model, metrics = load_model_and_metadata()
train_df = load_train_for_ui_hints()
# Sidebar: show metrics + model info
with st.sidebar:
st.header("Model Info")
st.write(f"Model repo: `{HF_MODEL_REPO}`")
if metrics:
st.subheader("Test Metrics")
st.write(f"Accuracy: **{metrics.get('accuracy', 'NA')}**")
st.write(f"F1: **{metrics.get('f1', 'NA')}**")
st.write(f"ROC-AUC: **{metrics.get('roc_auc', 'NA')}**")
else:
st.info("metrics.json not found in model repo (optional).")
input_df = build_input_form(train_df)
st.divider()
predict_btn = st.button("Predict", type="primary")
if predict_btn:
# Ensure column order matches training expectation
input_df = input_df[[c for c in FEATURE_COLS if c in input_df.columns]].copy()
proba = None
pred = None
# Some sklearn models support predict_proba; our pipeline does
pred = int(model.predict(input_df)[0])
proba = float(model.predict_proba(input_df)[0][1])
st.subheader("Prediction")
st.write(f"Predicted class: **{pred}** (1 = will take package, 0 = will not)")
st.write(f"Probability of ProdTaken=1: **{proba:.3f}**")
if pred == 1:
st.success("Likely to take the package ✅")
else:
st.warning("Unlikely to take the package ⚠️")
with st.expander("Show input row"):
st.dataframe(input_df)
if __name__ == "__main__":
main()