File size: 5,993 Bytes
395fcc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import json
import joblib
import numpy as np
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download

# ---------------------------------------------------------------
# Configuration -- edit these if your Hugging Face repos differ
# ---------------------------------------------------------------
# Repo ids on the Hugging Face Hub.
HF_MODEL_REPO = "VJBharathkumar/tourism-prod-prediction"  # model repo
HF_DATASET_REPO = "VJBharathkumar/tourism-wellness"  # dataset repo

# File names looked up inside the repos above.
MODEL_FILENAME = "model.joblib"
METRICS_FILENAME = "metrics.json"
TRAIN_FILENAME_IN_DATASET = "train.csv"  # uploaded in Step 5

# Label column; it is dropped from the training frame before building the UI.
TARGET = "ProdTaken"

# The 18 feature columns expected from the dataset. The list order is also
# the order in which widgets are drawn and input columns are arranged.
FEATURE_COLS = [
    "Age", "TypeofContact", "CityTier",
    "DurationOfPitch", "Occupation", "Gender",
    "NumberOfPersonVisiting", "NumberOfFollowups", "ProductPitched",
    "PreferredPropertyStar", "MaritalStatus", "NumberOfTrips",
    "Passport", "PitchSatisfactionScore", "OwnCar",
    "NumberOfChildrenVisiting", "Designation", "MonthlyIncome",
]

@st.cache_resource
def load_model_and_metadata():
    """Download the model and, when available, its metrics from the model repo.

    Returns:
        A ``(model, metrics)`` tuple. ``model`` is the object produced by
        ``joblib.load`` for ``MODEL_FILENAME``. ``metrics`` is the parsed
        JSON content of ``METRICS_FILENAME``, or ``None`` if that file could
        not be downloaded or parsed.
    """
    # NOTE(review): joblib.load deserializes via pickle, so HF_MODEL_REPO
    # must be a repo you trust -- confirm before pointing this elsewhere.
    model = joblib.load(
        hf_hub_download(
            repo_id=HF_MODEL_REPO,
            filename=MODEL_FILENAME,
            repo_type="model",
        )
    )

    # The metrics file is optional: any failure while fetching or parsing it
    # is deliberately swallowed and reported to the caller as None.
    try:
        metrics_file = hf_hub_download(
            repo_id=HF_MODEL_REPO,
            filename=METRICS_FILENAME,
            repo_type="model",
        )
        with open(metrics_file, "r", encoding="utf-8") as handle:
            metrics = json.load(handle)
    except Exception:
        metrics = None

    return model, metrics

@st.cache_data
def load_train_for_ui_hints():
    """Fetch the training CSV from the dataset repo for use as UI hints.

    The frame is used to derive dropdown choices for categorical columns
    and value ranges for numeric inputs.

    Returns:
        A DataFrame holding only the columns of ``FEATURE_COLS`` that exist
        in the CSV, in ``FEATURE_COLS`` order, with ``TARGET`` removed.
    """
    csv_path = hf_hub_download(
        repo_id=HF_DATASET_REPO,
        filename=TRAIN_FILENAME_IN_DATASET,
        repo_type="dataset",
    )
    frame = pd.read_csv(csv_path)

    # The label column plays no part in building the input widgets.
    frame = frame.drop(columns=[TARGET], errors="ignore")

    # Restrict to the known features so stray extra columns are ignored.
    known = [name for name in FEATURE_COLS if name in frame.columns]
    return frame[known].copy()

def _numeric_input_kwargs(series: pd.Series) -> dict:
    """Derive ``number_input`` keyword arguments from one training column.

    Args:
        series: Column of the training frame; values that cannot be parsed
            as numbers are ignored.

    Returns:
        A dict with ``min_value``, ``max_value`` and ``value`` (the median).
        When every valid value is a whole number the bounds are ints and
        ``step=1`` is added. When the column has no valid number at all,
        only ``value=0.0`` is returned so the widget is unbounded.
    """
    clean = pd.to_numeric(series, errors="coerce").dropna().astype(float)
    if clean.empty:
        # Without data there is no range to offer; min/max/median would be
        # NaN and int(NaN) would raise, so fall back to an unbounded input.
        return {"value": 0.0}

    low = float(clean.min())
    high = float(clean.max())
    mid = float(clean.median())

    # Whole-number columns get an integer widget stepping by 1.
    if np.all(np.isclose(clean % 1, 0)):
        return {
            "min_value": int(round(low)),
            "max_value": int(round(high)),
            "value": int(round(mid)),
            "step": 1,
        }
    return {"min_value": low, "max_value": high, "value": mid}


def build_input_form(train_df: pd.DataFrame) -> pd.DataFrame:
    """Render one input widget per feature and collect the entered values.

    Non-numeric columns become a select box listing the sorted distinct
    non-null training values; numeric columns become a number input bounded
    by the training min/max and pre-filled with the training median.
    Widgets alternate between two side-by-side page columns.

    Args:
        train_df: Training features used only to derive widget options and
            ranges. Columns of ``FEATURE_COLS`` missing from it are skipped.

    Returns:
        A single-row DataFrame of the entered values, with columns in
        ``FEATURE_COLS`` order.
    """
    st.subheader("Enter customer details")

    left, right = st.columns(2)
    values = {}

    # Alternate on the position among the columns actually drawn, so a
    # skipped feature does not leave the two page columns unbalanced.
    present_cols = [c for c in FEATURE_COLS if c in train_df.columns]
    for position, col_name in enumerate(present_cols):
        container = left if position % 2 == 0 else right
        column = train_df[col_name]

        # Decide by "is numeric" rather than "is object" so that category
        # and string dtypes are also offered as dropdowns.
        if pd.api.types.is_numeric_dtype(column):
            values[col_name] = container.number_input(
                col_name, **_numeric_input_kwargs(column)
            )
        else:
            options = sorted(column.dropna().unique().tolist())
            values[col_name] = container.selectbox(
                col_name, options=options, index=0
            )

    return pd.DataFrame([values], columns=present_cols)

def main():
    """Render the prediction page: sidebar info, input form and result.

    Loads the model, optional metrics and training frame, shows the metrics
    in the sidebar, draws the input form, and on "Predict" displays the
    predicted class plus, when the model exposes ``predict_proba``, the
    probability of the positive class.
    """
    st.set_page_config(page_title="Tourism Package Prediction", layout="wide")

    st.title("Tourism Package Prediction")
    st.write("Predict whether the customer will take the package (`ProdTaken = 1`).")

    model, metrics = load_model_and_metadata()
    train_df = load_train_for_ui_hints()

    # Sidebar: show metrics + model info
    with st.sidebar:
        st.header("Model Info")
        st.write(f"Model repo: `{HF_MODEL_REPO}`")
        if metrics:
            st.subheader("Test Metrics")
            st.write(f"Accuracy: **{metrics.get('accuracy', 'NA')}**")
            st.write(f"F1: **{metrics.get('f1', 'NA')}**")
            st.write(f"ROC-AUC: **{metrics.get('roc_auc', 'NA')}**")
        else:
            st.info("metrics.json not found in model repo (optional).")

    input_df = build_input_form(train_df)

    st.divider()

    if st.button("Predict", type="primary"):
        # Ensure column order matches training expectation
        input_df = input_df[[c for c in FEATURE_COLS if c in input_df.columns]].copy()

        pred = int(model.predict(input_df)[0])

        st.subheader("Prediction")
        st.write(f"Predicted class: **{pred}**  (1 = will take package, 0 = will not)")

        # Not every estimator exposes predict_proba; only report a
        # probability when the loaded model can actually produce one.
        if hasattr(model, "predict_proba"):
            # Column 1 is taken as the positive class (ProdTaken=1).
            proba = float(model.predict_proba(input_df)[0][1])
            st.write(f"Probability of ProdTaken=1: **{proba:.3f}**")

        if pred == 1:
            st.success("Likely to take the package ✅")
        else:
            st.warning("Unlikely to take the package ⚠️")

        with st.expander("Show input row"):
            st.dataframe(input_df)

# Entry point: build the page when this file is executed as a script.
if __name__ == "__main__":
    main()