dhani10 committed on
Commit
6a07055
·
verified ·
1 Parent(s): bff7093

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +61 -59
app.py CHANGED
@@ -1,20 +1,26 @@
1
  import os
2
  import joblib
3
- import pandas as pd
4
  import numpy as np
 
5
  import streamlit as st
6
  from huggingface_hub import hf_hub_download, login
7
 
8
- # MUST be the first Streamlit command
9
- st.set_page_config(page_title="Tourism Wellness Package Predictor", layout="centered")
10
-
11
- # ----------------------------
12
- # HF auth & writable cache (/tmp)
13
- # ----------------------------
14
- HF_TOKEN = os.getenv("HF_TOKEN") # Space secret if needed
 
 
 
 
 
15
  HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "dhani10/tourism-model")
16
  MODEL_FILE = os.getenv("MODEL_FILE", "model/best_model.joblib")
17
 
 
18
  HF_CACHE_ROOT = os.getenv("HF_HOME", "/tmp/huggingface")
19
  os.environ["HF_HOME"] = HF_CACHE_ROOT
20
  os.environ["HF_HUB_CACHE"] = os.path.join(HF_CACHE_ROOT, "hub")
@@ -22,12 +28,16 @@ os.environ["TRANSFORMERS_CACHE"] = os.path.join(HF_CACHE_ROOT, "transformers")
22
  for d in (HF_CACHE_ROOT, os.environ["HF_HUB_CACHE"], os.environ["TRANSFORMERS_CACHE"]):
23
  os.makedirs(d, exist_ok=True)
24
 
 
25
  if HF_TOKEN:
26
  try:
27
  login(token=HF_TOKEN)
28
  except Exception:
29
  pass
30
 
 
 
 
31
  @st.cache_resource
32
  def load_model():
33
  local_path = hf_hub_download(
@@ -41,14 +51,15 @@ def load_model():
41
 
42
  model = load_model()
43
 
44
- # ----------------------------
45
- # Helper: read expected input columns from preprocessor
46
- # ----------------------------
47
  def get_expected_input_columns(clf):
48
  pre = clf.named_steps.get("preprocessor")
49
  cols = []
50
  if pre is None:
51
  return cols
 
52
  transformers = getattr(pre, "transformers", None) or getattr(pre, "transformers_", [])
53
  for _, _, selected in transformers:
54
  if selected in (None, "drop"):
@@ -57,34 +68,37 @@ def get_expected_input_columns(clf):
57
  cols.extend(selected)
58
  elif isinstance(selected, (tuple, np.ndarray, pd.Index)):
59
  cols.extend(list(selected))
60
- # preserve order + unique
61
- seen = set()
62
- ordered = []
63
- for c in cols:
64
- if c not in seen:
65
- seen.add(c)
66
- ordered.append(c)
67
- return ordered
68
 
69
  EXPECTED_COLS = get_expected_input_columns(model)
70
 
71
- # ----------------------------
72
- # Streamlit UI (collect ALL training features)
73
- # ----------------------------
74
- st.set_page_config(page_title="Tourism Wellness Package Predictor", layout="centered")
75
- st.title("Wellness Tourism Package Predictor")
 
 
 
 
 
 
 
 
76
  st.caption("Fill in customer details to predict purchase likelihood.")
77
 
78
- # Categorical options — match dataset spellings exactly
79
- TYPE_OF_CONTACT_OPTS = ["Company Invited", "Self Enquiry"] # <- dataset text
80
- OCCUPATION_OPTS = ["Salaried", "Small Business", "Freelancer", "Large Business", "Other"]
81
  GENDER_OPTS = ["Male", "Female"]
82
- PRODUCT_PITCHED_OPTS = ["Basic", "Deluxe", "Standard", "Super Deluxe", "King"] # dataset list
83
- MARITAL_STATUS_OPTS = ["Single", "Married", "Divorced", "Unmarried"] # dataset used "Unmarried"
84
- DESIGNATION_OPTS = ["Executive", "Senior Executive", "Manager", "Senior Manager", "AVP", "VP", "Director", "Junior Executive"]
85
 
86
  with st.form("predict_form"):
87
  col1, col2 = st.columns(2)
 
88
  with col1:
89
  Age = st.number_input("Age", min_value=18, max_value=100, value=30)
90
  TypeofContact = st.selectbox("Type of Contact", TYPE_OF_CONTACT_OPTS)
@@ -110,7 +124,7 @@ with st.form("predict_form"):
110
  submitted = st.form_submit_button("Predict")
111
 
112
  if submitted:
113
- # Build UI row (exact training column names)
114
  ui_row = {
115
  "Age": Age,
116
  "TypeofContact": TypeofContact,
@@ -132,34 +146,26 @@ if submitted:
132
  "MonthlyIncome": float(MonthlyIncome),
133
  }
134
 
135
- # Fallback if EXPECTED_COLS couldn't be read for some reason
136
- base_cols = EXPECTED_COLS if EXPECTED_COLS else list(ui_row.keys())
 
 
 
 
 
 
 
137
 
138
- # Start with expected columns set to NaN, then overlay UI values
139
- template = {c: [np.nan] for c in base_cols}
140
- row = pd.DataFrame(template)
141
  for k, v in ui_row.items():
142
  if k in row.columns:
143
  row.at[0, k] = v
144
 
145
- # Coerce numerics (safeguard)
146
- numeric_cols = [
147
- "Age", "CityTier", "DurationOfPitch", "NumberOfPersonVisiting", "NumberOfFollowups",
148
- "PreferredPropertyStar", "NumberOfTrips", "Passport", "PitchSatisfactionScore",
149
- "OwnCar", "NumberOfChildrenVisiting", "MonthlyIncome",
150
- ]
151
- for c in numeric_cols:
152
- if c in row.columns:
153
- row[c] = pd.to_numeric(row[c], errors="coerce")
154
-
155
- # Optional: if your pipeline didn't add imputers, simple fill for numerics
156
- for c in numeric_cols:
157
- if c in row.columns and pd.isna(row.at[0, c]):
158
- row.at[0, c] = 0
159
-
160
  try:
161
  pred = model.predict(row)[0]
162
- proba = float(model.predict_proba(row)[0, 1]) if hasattr(model, "predict_proba") else None
 
 
163
 
164
  st.subheader("Result")
165
  if pred == 1:
@@ -168,13 +174,9 @@ if submitted:
168
  st.error(f"Not likely to purchase (confidence: {1 - proba:.2f})" if proba is not None else "Not likely to purchase")
169
 
170
  with st.expander("Inputs sent to model"):
171
- st.dataframe(row)
172
-
173
- if not EXPECTED_COLS:
174
- st.info("Note: EXPECTED_COLS could not be read from the pipeline; used UI keys as fallback.")
175
 
176
  except Exception as e:
177
  st.error(f"Prediction failed: {e}")
178
- with st.expander("Debug"):
179
- st.write("Expected columns:", EXPECTED_COLS)
180
- st.dataframe(row)
 
1
  import os
2
  import joblib
 
3
  import numpy as np
4
+ import pandas as pd
5
  import streamlit as st
6
  from huggingface_hub import hf_hub_download, login
7
 
8
+ # ──────────────────────────────────────────────────────────────────────────────
9
+ # Streamlit page config MUST be the very first Streamlit call on the page
10
+ # (use a guard so it only runs once, even on reruns).
11
+ # ──────────────────────────────────────────────────────────────────────────────
12
+ if "_page_config_set" not in st.session_state:
13
+ st.set_page_config(page_title="Tourism Wellness Package Predictor", layout="centered")
14
+ st.session_state["_page_config_set"] = True
15
+
16
+ # ──────────────────────────────────────────────────────────────────────────────
17
+ # HF Hub config & auth
18
+ # ──────────────────────────────────────────────────────────────────────────────
19
+ HF_TOKEN = os.getenv("HF_TOKEN") # optional if model repo is public
20
  HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "dhani10/tourism-model")
21
  MODEL_FILE = os.getenv("MODEL_FILE", "model/best_model.joblib")
22
 
23
+ # Writable caches on Spaces
24
  HF_CACHE_ROOT = os.getenv("HF_HOME", "/tmp/huggingface")
25
  os.environ["HF_HOME"] = HF_CACHE_ROOT
26
  os.environ["HF_HUB_CACHE"] = os.path.join(HF_CACHE_ROOT, "hub")
 
28
  for d in (HF_CACHE_ROOT, os.environ["HF_HUB_CACHE"], os.environ["TRANSFORMERS_CACHE"]):
29
  os.makedirs(d, exist_ok=True)
30
 
31
+ # Login if token present (private repos)
32
  if HF_TOKEN:
33
  try:
34
  login(token=HF_TOKEN)
35
  except Exception:
36
  pass
37
 
38
+ # ──────────────────────────────────────────────────────────────────────────────
39
+ # Load model from the Hub (cached)
40
+ # ──────────────────────────────────────────────────────────────────────────────
41
  @st.cache_resource
42
  def load_model():
43
  local_path = hf_hub_download(
 
51
 
52
  model = load_model()
53
 
54
+ # ──────────────────────────────────────────────────────────────────────────────
55
+ # Helper: get the raw input feature names the ColumnTransformer expects
56
+ # ──────────────────────────────────────────────────────────────────────────────
57
  def get_expected_input_columns(clf):
58
  pre = clf.named_steps.get("preprocessor")
59
  cols = []
60
  if pre is None:
61
  return cols
62
+ # Works both before and after fit
63
  transformers = getattr(pre, "transformers", None) or getattr(pre, "transformers_", [])
64
  for _, _, selected in transformers:
65
  if selected in (None, "drop"):
 
68
  cols.extend(selected)
69
  elif isinstance(selected, (tuple, np.ndarray, pd.Index)):
70
  cols.extend(list(selected))
71
+ # unique, preserve order
72
+ return list(dict.fromkeys(cols))
 
 
 
 
 
 
73
 
74
  EXPECTED_COLS = get_expected_input_columns(model)
75
 
76
+ # Known categorical feature names from your dataset
77
+ CAT_FEATURES = {
78
+ "TypeofContact", "Occupation", "Gender", "ProductPitched",
79
+ "MaritalStatus", "Designation"
80
+ }
81
+ # Reasonable defaults for features we don't expose explicitly
82
+ CAT_DEFAULT = "Unknown"
83
+ NUM_DEFAULT = 0
84
+
85
+ # ──────────────────────────────────────────────────────────────────────────────
86
+ # UI
87
+ # ──────────────────────────────────────────────────────────────────────────────
88
+ st.title("Tourism Wellness Package Predictor")
89
  st.caption("Fill in customer details to predict purchase likelihood.")
90
 
91
+ # Categorical options (adjust if your dataset vocabulary differs)
92
+ TYPE_OF_CONTACT_OPTS = ["Company Invited", "Self Inquiry"]
93
+ OCCUPATION_OPTS = ["Salaried", "Freelancer", "Other"]
94
  GENDER_OPTS = ["Male", "Female"]
95
+ PRODUCT_PITCHED_OPTS = ["Basic", "Deluxe", "King", "Standard", "Super Deluxe", "Elite"]
96
+ MARITAL_STATUS_OPTS = ["Single", "Married", "Divorced"]
97
+ DESIGNATION_OPTS = ["Executive", "Manager", "Senior Manager", "AVP", "VP"]
98
 
99
  with st.form("predict_form"):
100
  col1, col2 = st.columns(2)
101
+
102
  with col1:
103
  Age = st.number_input("Age", min_value=18, max_value=100, value=30)
104
  TypeofContact = st.selectbox("Type of Contact", TYPE_OF_CONTACT_OPTS)
 
124
  submitted = st.form_submit_button("Predict")
125
 
126
  if submitted:
127
+ # User-provided features
128
  ui_row = {
129
  "Age": Age,
130
  "TypeofContact": TypeofContact,
 
146
  "MonthlyIncome": float(MonthlyIncome),
147
  }
148
 
149
+ # Build a 1-row frame with EXACTLY the expected columns:
150
+ # 1) Start from defaults (avoid NaNs)
151
+ defaults = {}
152
+ for c in EXPECTED_COLS:
153
+ if c in CAT_FEATURES:
154
+ defaults[c] = CAT_DEFAULT
155
+ else:
156
+ defaults[c] = NUM_DEFAULT
157
+ row = pd.DataFrame({k: [v] for k, v in defaults.items()})
158
 
159
+ # 2) Overlay user inputs where available
 
 
160
  for k, v in ui_row.items():
161
  if k in row.columns:
162
  row.at[0, k] = v
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  try:
165
  pred = model.predict(row)[0]
166
+ proba = None
167
+ if hasattr(model, "predict_proba"):
168
+ proba = float(model.predict_proba(row)[0, 1])
169
 
170
  st.subheader("Result")
171
  if pred == 1:
 
174
  st.error(f"Not likely to purchase (confidence: {1 - proba:.2f})" if proba is not None else "Not likely to purchase")
175
 
176
  with st.expander("Inputs sent to model"):
177
+ st.write(row)
 
 
 
178
 
179
  except Exception as e:
180
  st.error(f"Prediction failed: {e}")
181
+ with st.expander("Debug: expected raw feature names"):
182
+ st.write(EXPECTED_COLS)