Spaces:

VJBharathkumar
/

Tourism-Prediction

Sleeping

File size: 5,993 Bytes

395fcc8

import json
import joblib
import numpy as np
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download

# ---------------------------------------------------------
# Settings: adjust the repo ids / file names for your setup
# ---------------------------------------------------------

# Hugging Face repositories: the model repo and the dataset repo.
HF_MODEL_REPO = "VJBharathkumar/tourism-prod-prediction"
HF_DATASET_REPO = "VJBharathkumar/tourism-wellness"

# File names looked up inside those repositories.
MODEL_FILENAME = "model.joblib"
METRICS_FILENAME = "metrics.json"
TRAIN_FILENAME_IN_DATASET = "train.csv"  # training CSV (uploaded in Step 5)

# Name of the label column.
TARGET = "ProdTaken"

# The 18 expected feature columns. The order is significant: the input row
# handed to the model is arranged in exactly this order.
FEATURE_COLS = [
    "Age", "TypeofContact", "CityTier",
    "DurationOfPitch", "Occupation", "Gender",
    "NumberOfPersonVisiting", "NumberOfFollowups", "ProductPitched",
    "PreferredPropertyStar", "MaritalStatus", "NumberOfTrips",
    "Passport", "PitchSatisfactionScore", "OwnCar",
    "NumberOfChildrenVisiting", "Designation", "MonthlyIncome",
]

@st.cache_resource
def load_model_and_metadata():
    """Download the trained model and, when available, its metrics file.

    Both files are fetched from ``HF_MODEL_REPO`` with ``hf_hub_download``.

    Returns:
        A ``(model, metrics)`` tuple. ``model`` is the object that
        ``joblib.load`` deserialises from ``MODEL_FILENAME``. ``metrics`` is
        the parsed JSON content of ``METRICS_FILENAME``, or ``None`` when that
        file could not be downloaded or parsed.

    Raises:
        Any exception raised while downloading or loading the model itself
        propagates to the caller; only metrics failures are tolerated.
    """
    import logging

    model_path = hf_hub_download(
        repo_id=HF_MODEL_REPO,
        filename=MODEL_FILENAME,
        repo_type="model",
    )
    # SECURITY: joblib.load unpickles the file, and unpickling can execute
    # arbitrary code. Only point HF_MODEL_REPO at a repository you trust.
    model = joblib.load(model_path)

    try:
        metrics_path = hf_hub_download(
            repo_id=HF_MODEL_REPO,
            filename=METRICS_FILENAME,
            repo_type="model",
        )
        with open(metrics_path, "r", encoding="utf-8") as f:
            metrics = json.load(f)
    except Exception:
        # Metrics are optional, so the app must keep working without them.
        # Log the cause instead of discarding it, so a malformed file or a
        # network failure is distinguishable from a missing file.
        logging.getLogger(__name__).warning(
            "Could not load %s from %s; continuing without metrics.",
            METRICS_FILENAME,
            HF_MODEL_REPO,
            exc_info=True,
        )
        metrics = None

    return model, metrics

@st.cache_data
def load_train_for_ui_hints():
    """
    Fetch train.csv from the HF dataset repo and return its feature columns.

    The returned frame is what the input form uses to derive:
    - the choices offered for categorical columns
    - the min/max bounds for numeric inputs
    """
    csv_path = hf_hub_download(
        repo_id=HF_DATASET_REPO,
        filename=TRAIN_FILENAME_IN_DATASET,
        repo_type="dataset",
    )
    frame = pd.read_csv(csv_path)

    # The label is not a feature; remove it when the file carries it.
    frame = frame.drop(columns=[TARGET], errors="ignore")

    # Restrict to the expected features, in FEATURE_COLS order, so stray
    # extra columns in the CSV never reach the form.
    expected_present = [name for name in FEATURE_COLS if name in frame.columns]
    return frame[expected_present].copy()

def build_input_form(train_df: pd.DataFrame) -> pd.DataFrame:
    """Render one input widget per feature column and collect the entries.

    Columns with a numeric dtype get a ``number_input`` bounded by the
    minimum/maximum found in ``train_df`` and pre-filled with the median.
    Every other column gets a ``selectbox`` listing its distinct non-null
    values. Widgets alternate between two side-by-side Streamlit columns.

    Args:
        train_df: Training features, used only to derive widget options,
            bounds and default values.

    Returns:
        A single-row DataFrame with the entered values. Its columns follow
        ``FEATURE_COLS`` order, limited to the columns present in
        ``train_df``.
    """
    st.subheader("Enter customer details")

    left, right = st.columns(2)
    values = {}

    def draw_categorical(col_name, container):
        # key=str keeps the sort from raising when a column mixes value types;
        # for all-string columns the order is unchanged.
        options = sorted(train_df[col_name].dropna().unique().tolist(), key=str)
        values[col_name] = container.selectbox(col_name, options=options, index=0)

    def draw_numeric(col_name, container):
        observed = pd.to_numeric(train_df[col_name], errors="coerce").dropna()
        observed = observed.to_numpy(dtype=float)

        if observed.size == 0:
            # No usable training values: min/max/median would be NaN and
            # int(NaN) raises, so fall back to an unbounded input.
            values[col_name] = container.number_input(col_name, value=0.0)
            return

        min_v = float(observed.min())
        max_v = float(observed.max())
        med_v = float(np.median(observed))

        # number_input needs min/max/value to share one numeric type, so
        # each branch casts all three consistently.
        if np.all(np.isclose(observed % 1, 0)):
            # Every observed value is a whole number: use an integer stepper.
            values[col_name] = container.number_input(
                col_name,
                min_value=int(min_v),
                max_value=int(max_v),
                value=int(round(med_v)),
                step=1,
            )
        else:
            values[col_name] = container.number_input(
                col_name,
                min_value=min_v,
                max_value=max_v,
                value=med_v,
            )

    # Alternate between the two layout columns, keyed on the position in
    # FEATURE_COLS.
    for i, col_name in enumerate(FEATURE_COLS):
        if col_name not in train_df.columns:
            continue
        container = left if i % 2 == 0 else right
        # Classify by "is numeric" rather than "is object": text columns are
        # not guaranteed to use the object dtype (e.g. pandas' dedicated
        # string dtype) and must not be routed to the numeric branch.
        if pd.api.types.is_numeric_dtype(train_df[col_name]):
            draw_numeric(col_name, container)
        else:
            draw_categorical(col_name, container)

    input_df = pd.DataFrame([values], columns=[c for c in FEATURE_COLS if c in values])
    return input_df

def main():
    """Build the Streamlit page: sidebar model info, input form and prediction.

    Loads the model, optional metrics and training data, renders the input
    form, and on "Predict" shows the predicted class. The probability of the
    positive class is shown as well when the model offers ``predict_proba``.
    """
    st.set_page_config(page_title="Tourism Package Prediction", layout="wide")

    st.title("Tourism Package Prediction")
    st.write("Predict whether the customer will take the package (`ProdTaken = 1`).")

    model, metrics = load_model_and_metadata()
    train_df = load_train_for_ui_hints()

    # Sidebar: show metrics + model info
    with st.sidebar:
        st.header("Model Info")
        st.write(f"Model repo: `{HF_MODEL_REPO}`")
        if metrics:
            st.subheader("Test Metrics")
            st.write(f"Accuracy: **{metrics.get('accuracy', 'NA')}**")
            st.write(f"F1: **{metrics.get('f1', 'NA')}**")
            st.write(f"ROC-AUC: **{metrics.get('roc_auc', 'NA')}**")
        else:
            st.info("metrics.json not found in model repo (optional).")

    input_df = build_input_form(train_df)

    st.divider()

    predict_btn = st.button("Predict", type="primary")

    if predict_btn:
        # Ensure column order matches training expectation
        input_df = input_df[[c for c in FEATURE_COLS if c in input_df.columns]].copy()

        pred = int(model.predict(input_df)[0])

        # Not every estimator exposes predict_proba, so only compute and
        # display a probability when the loaded model provides it.
        proba = None
        if hasattr(model, "predict_proba"):
            proba = float(model.predict_proba(input_df)[0][1])

        st.subheader("Prediction")
        st.write(f"Predicted class: **{pred}**  (1 = will take package, 0 = will not)")
        if proba is not None:
            st.write(f"Probability of ProdTaken=1: **{proba:.3f}**")

        if pred == 1:
            st.success("Likely to take the package ✅")
        else:
            st.warning("Unlikely to take the package ⚠️")

        with st.expander("Show input row"):
            st.dataframe(input_df)

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()