simnid committed on
Commit
7793b7f
·
verified ·
1 Parent(s): 38b8978

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. app.py +92 -82
  2. bulk_data_upload.py +49 -0
app.py CHANGED
@@ -1,8 +1,10 @@
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  from huggingface_hub import hf_hub_download
5
  import joblib
 
6
 
7
  # App title and description
8
  st.set_page_config(
@@ -11,13 +13,13 @@ st.set_page_config(
11
  layout="wide"
12
  )
13
 
14
- st.title("🏖️ Wellness Tourism Prediction App")
15
  st.markdown("""
16
  This application predicts whether a customer is likely to purchase a wellness tourism package
17
  based on their demographic, behavioral, and engagement data.
18
  """)
19
 
20
- # Sidebar for information
21
  with st.sidebar:
22
  st.header("About This Model")
23
  st.markdown("""
@@ -25,98 +27,81 @@ with st.sidebar:
25
  - Algorithm: XGBoost Classifier (pipeline with preprocessing)
26
  - Trained on: Wellness Tourism Dataset
27
  - Target: Product Taken (1 = Purchased, 0 = Not Purchased)
 
 
 
 
28
  """)
29
- st.subheader("Model Performance (example)")
30
- st.metric("ROC AUC", "0.94")
31
- st.metric("Precision (Class 1)", "0.69")
32
- st.metric("Recall (Class 1)", "0.79")
33
 
34
- # Function to download and load model (pipeline)
 
 
 
 
 
 
 
 
35
  @st.cache_resource
36
  def load_model():
37
- """Load the trained pipeline from Hugging Face Hub"""
38
  try:
39
  model_path = hf_hub_download(
40
- repo_id="simnid/wellness-tourism-model",
41
- filename="best_wellness_tourism_model.joblib",
42
  repo_type="model"
43
  )
44
- model = joblib.load(model_path)
45
- return model
46
  except Exception as e:
47
  st.error(f"Error loading model: {e}")
48
  return None
49
 
50
  model = load_model()
51
  if model is None:
52
- st.warning("Model could not be loaded. Please check your connection.")
53
  st.stop()
54
 
55
- # Attempt to infer expected input columns from the trained pipeline
56
- def get_expected_input_columns(model):
57
- try:
58
- # If ColumnTransformer was used as first step in pipeline with name 'preprocessor'
59
- if hasattr(model, "named_steps") and "preprocessor" in model.named_steps:
60
- pre = model.named_steps["preprocessor"]
61
- cols = []
62
- for transformer in pre.transformers_:
63
- name, trans, cols_list = transformer
64
- # cols_list may be a slice or list
65
- if isinstance(cols_list, (list, tuple)):
66
- cols.extend(list(cols_list))
67
- else:
68
- try:
69
- cols.extend(list(cols_list))
70
- except Exception:
71
- pass
72
- return cols
73
- except Exception:
74
- pass
75
- # Fallback: define expected columns explicitly
76
- return [
77
- 'Age','TypeofContact','CityTier','DurationOfPitch','Occupation','Gender',
78
- 'NumberOfPersonVisiting','NumberOfFollowups','ProductPitched','PreferredPropertyStar',
79
- 'MaritalStatus','NumberOfTrips','Passport','PitchSatisfactionScore','OwnCar',
80
- 'NumberOfChildrenVisiting','Designation','MonthlyIncome','PitchEfficiency'
81
- ]
82
-
83
- expected_cols = get_expected_input_columns(model)
84
-
85
- # User input section
86
  st.header("Customer Information")
87
  col1, col2, col3 = st.columns(3)
88
 
89
  with col1:
90
- Age = st.number_input("Age", min_value=18, max_value=80, value=35, step=1)
 
91
  Gender = st.selectbox("Gender", ["Male", "Female"])
92
  MaritalStatus = st.selectbox("Marital Status", ["Single", "Married", "Divorced", "Unmarried"])
93
- NumberOfChildrenVisiting = st.number_input("Number of Children Visiting", min_value=0, max_value=5, value=0, step=1)
94
  Designation = st.selectbox("Designation", ["Executive", "Manager", "Senior Manager", "AVP", "VP"])
95
 
96
  with col2:
 
97
  CityTier = st.selectbox("City Tier", [1, 2, 3])
98
  PreferredPropertyStar = st.selectbox("Preferred Property Star Rating", [3, 4, 5])
99
  Passport = st.selectbox("Has Passport", [0, 1], format_func=lambda x: "No" if x == 0 else "Yes")
100
  OwnCar = st.selectbox("Owns Car", [0, 1], format_func=lambda x: "No" if x == 0 else "Yes")
101
- NumberOfTrips = st.number_input("Number of Previous Trips", min_value=0, max_value=20, value=2, step=1)
102
 
103
  with col3:
 
104
  TypeofContact = st.selectbox("Type of Contact", ["Self Enquiry", "Company Invited"])
105
- DurationOfPitch = st.number_input("Duration of Pitch (minutes)", min_value=0.0, max_value=60.0, value=15.0, step=0.5)
106
- NumberOfPersonVisiting = st.number_input("Number of People Visiting", min_value=1, max_value=10, value=2, step=1)
107
- NumberOfFollowups = st.number_input("Number of Follow-ups", min_value=0, max_value=10, value=3, step=1)
108
  ProductPitched = st.selectbox("Product Pitched", ["Basic", "Deluxe", "Standard", "Super Deluxe", "King"])
109
- PitchSatisfactionScore = st.slider("Pitch Satisfaction Score", 1, 5, 3)
110
 
111
- # Financial info & derived feature
112
- PitchEfficiency = DurationOfPitch * PitchSatisfactionScore
113
- st.metric("Calculated Pitch Efficiency", f"{PitchEfficiency:.2f}")
 
 
 
114
 
115
- Occupation = st.selectbox("Occupation", ["Salaried", "Small Business", "Large Business", "Free Lancer"])
116
- MonthlyIncome = st.number_input("Monthly Income ($)", min_value=1000, max_value=50000, value=15000, step=500)
 
117
 
118
- # Assemble input as raw (strings preserved)
119
- input_row = {
120
  'Age': Age,
121
  'TypeofContact': TypeofContact,
122
  'CityTier': CityTier,
@@ -136,51 +121,76 @@ input_row = {
136
  'Designation': Designation,
137
  'MonthlyIncome': MonthlyIncome,
138
  'PitchEfficiency': PitchEfficiency
139
- }
140
-
141
- input_data = pd.DataFrame([input_row])
142
-
143
- try:
144
- cols_to_use = [c for c in expected_cols if c in input_data.columns]
145
- input_data = input_data[cols_to_use]
146
- except Exception:
147
- pass
148
 
149
  with st.expander("View Input Data"):
150
  st.dataframe(input_data)
 
 
151
 
152
- # Prediction
153
  st.header("Prediction")
154
  if st.button("Predict Purchase Probability", type="primary", use_container_width=True):
155
  with st.spinner("Making prediction..."):
156
  try:
157
- # model is a pipeline that includes preprocessing
158
  prediction_proba = model.predict_proba(input_data)[0]
159
- prediction_class = int(model.predict(input_data)[0])
160
-
161
- prob_purchase = float(prediction_proba[1] * 100)
162
- prob_no_purchase = float(prediction_proba[0] * 100)
163
 
164
  col_result1, col_result2 = st.columns(2)
165
  with col_result1:
166
  st.subheader("Prediction Result")
167
  if prediction_class == 1:
168
- st.success("**Customer is LIKELY to purchase**")
169
  st.balloons()
170
  else:
171
- st.info("**Customer is UNLIKELY to purchase**")
172
-
173
  with col_result2:
174
  st.subheader("Probability Scores")
175
- st.metric("Probability of Purchase", f"{prob_purchase:.1f}%")
176
- st.metric("Probability of No Purchase", f"{prob_no_purchase:.1f}%")
177
- st.progress(int(min(max(prob_purchase, 0), 100)))
178
- st.caption(f"Confidence: {prob_purchase:.1f}%")
179
 
180
- # Business insights...
181
  except Exception as e:
182
  st.error(f"Error making prediction: {e}")
183
 
184
- # Footer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  st.markdown("---")
186
  st.caption("Wellness Tourism Prediction Model | Built with XGBoost & Streamlit")
 
1
+ # Importing packages
2
  import streamlit as st
3
  import pandas as pd
4
  import numpy as np
5
  from huggingface_hub import hf_hub_download
6
  import joblib
7
+ import io
8
 
9
  # App title and description
10
  st.set_page_config(
 
13
  layout="wide"
14
  )
15
 
16
+ st.title("Wellness Tourism Prediction App")
17
  st.markdown("""
18
  This application predicts whether a customer is likely to purchase a wellness tourism package
19
  based on their demographic, behavioral, and engagement data.
20
  """)
21
 
22
+ # Sidebar
23
  with st.sidebar:
24
  st.header("About This Model")
25
  st.markdown("""
 
27
  - Algorithm: XGBoost Classifier (pipeline with preprocessing)
28
  - Trained on: Wellness Tourism Dataset
29
  - Target: Product Taken (1 = Purchased, 0 = Not Purchased)
30
+ **Key Features:**
31
+ - Handles class imbalance with scale_pos_weight
32
+ - Uses preprocessing pipeline (scaling + encoding)
33
+ - Optimized for ROC-AUC score
34
  """)
 
 
 
 
35
 
36
+ st.subheader("Model Performance")
37
+ st.metric("ROC AUC", "0.9683")
38
+ st.metric("Precision (Class 1)", "0.867")
39
+ st.metric("Recall (Class 1)", "0.818")
40
+
41
+ # Load Model
42
+ MODEL_REPO_ID = "simnid/wellness-tourism-model"
43
+ MODEL_FILENAME = "best_wellness_tourism_model.joblib"
44
+
@st.cache_resource
def _fetch_model():
    """Download the serialized pipeline from the Hugging Face Hub and load it.

    Errors are deliberately allowed to propagate: Streamlit caches only a
    successful return value, so raising keeps a failed download out of the
    cache and lets the next rerun try again.
    """
    model_path = hf_hub_download(
        repo_id=MODEL_REPO_ID,
        filename=MODEL_FILENAME,
        repo_type="model"
    )
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code. Only point MODEL_REPO_ID at a repository you trust.
    return joblib.load(model_path)


def load_model():
    """Return the trained pipeline, or None if it could not be loaded.

    The download/deserialization is cached by ``_fetch_model``. The error
    handling lives out here, outside the cached function, so that a transient
    failure is reported to the user without a ``None`` result being cached
    for the lifetime of the app.
    """
    try:
        return _fetch_model()
    except Exception as e:
        # Top-level UI boundary: surface the problem and let the caller stop.
        st.error(f"Error loading model: {e}")
        return None
57
 
58
  model = load_model()
59
  if model is None:
60
+ st.warning("Model could not be loaded.")
61
  st.stop()
62
 
63
+ # --- Customer Input ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  st.header("Customer Information")
65
  col1, col2, col3 = st.columns(3)
66
 
67
  with col1:
68
+ st.subheader("Demographics")
69
+ Age = st.number_input("Age", 18, 80, 35, 1)
70
  Gender = st.selectbox("Gender", ["Male", "Female"])
71
  MaritalStatus = st.selectbox("Marital Status", ["Single", "Married", "Divorced", "Unmarried"])
72
+ NumberOfChildrenVisiting = st.number_input("Number of Children Visiting", 0, 5, 0)
73
  Designation = st.selectbox("Designation", ["Executive", "Manager", "Senior Manager", "AVP", "VP"])
74
 
75
  with col2:
76
+ st.subheader("Travel Preferences")
77
  CityTier = st.selectbox("City Tier", [1, 2, 3])
78
  PreferredPropertyStar = st.selectbox("Preferred Property Star Rating", [3, 4, 5])
79
  Passport = st.selectbox("Has Passport", [0, 1], format_func=lambda x: "No" if x == 0 else "Yes")
80
  OwnCar = st.selectbox("Owns Car", [0, 1], format_func=lambda x: "No" if x == 0 else "Yes")
81
+ NumberOfTrips = st.number_input("Number of Previous Trips", 0, 20, 2)
82
 
83
  with col3:
84
+ st.subheader("Engagement Details")
85
  TypeofContact = st.selectbox("Type of Contact", ["Self Enquiry", "Company Invited"])
86
+ DurationOfPitch = st.number_input("Duration of Pitch (minutes)", 0.0, 60.0, 15.0, 0.5)
87
+ NumberOfPersonVisiting = st.number_input("Number of People Visiting", 1, 10, 2)
88
+ NumberOfFollowups = st.number_input("Number of Follow-ups", 0, 10, 3)
89
  ProductPitched = st.selectbox("Product Pitched", ["Basic", "Deluxe", "Standard", "Super Deluxe", "King"])
90
+ PitchSatisfactionScore = st.slider("Pitch Satisfaction Score", 0.0, 5.0, 3.0, 0.1)
91
 
92
+ # Financial Information
93
+ st.subheader("Financial Information")
94
+ col4, col5 = st.columns(2)
95
+ with col4:
96
+ Occupation = st.selectbox("Occupation", ["Salaried", "Small Business", "Large Business", "Free Lancer"])
97
+ MonthlyIncome = st.number_input("Monthly Income ($)", 1000, 1000000, 15000, 500)
98
 
99
+ with col5:
100
+ PitchEfficiency = DurationOfPitch * PitchSatisfactionScore
101
+ st.metric("Calculated Pitch Efficiency", f"{PitchEfficiency:.2f}")
102
 
103
+ # Assemble Input
104
+ input_data = pd.DataFrame([{
105
  'Age': Age,
106
  'TypeofContact': TypeofContact,
107
  'CityTier': CityTier,
 
121
  'Designation': Designation,
122
  'MonthlyIncome': MonthlyIncome,
123
  'PitchEfficiency': PitchEfficiency
124
+ }])
 
 
 
 
 
 
 
 
125
 
126
  with st.expander("View Input Data"):
127
  st.dataframe(input_data)
128
+ csv = input_data.to_csv(index=False).encode('utf-8')
129
+ st.download_button("Download Input Data", csv, "input_data.csv", "text/csv")
130
 
131
+ # Prediction
st.header("Prediction")
predict_clicked = st.button(
    "Predict Purchase Probability",
    type="primary",
    use_container_width=True,
)
if predict_clicked:
    with st.spinner("Making prediction..."):
        try:
            # The model is called on the single-row frame assembled above;
            # index 0 selects that row's result.
            prediction_proba = model.predict_proba(input_data)[0]
            prediction_class = model.predict(input_data)[0]

            # Column 1 is reported as "purchase", column 0 as "no purchase".
            purchase_pct = prediction_proba[1]*100
            no_purchase_pct = prediction_proba[0]*100

            col_result1, col_result2 = st.columns(2)

            # Left column: the predicted class as a headline message.
            with col_result1:
                st.subheader("Prediction Result")
                if prediction_class == 1:
                    st.success("Customer is LIKELY to purchase")
                    st.balloons()
                else:
                    st.info("Customer is UNLIKELY to purchase")

            # Right column: both class probabilities plus a progress bar.
            with col_result2:
                st.subheader("Probability Scores")
                st.metric("Probability of Purchase", f"{purchase_pct:.1f}%")
                st.metric("Probability of No Purchase", f"{no_purchase_pct:.1f}%")
                st.progress(int(purchase_pct))

        except Exception as e:
            st.error(f"Error making prediction: {e}")
155
 
156
+ # Bulk CSV Prediction
157
+ st.header("Bulk CSV Prediction")
158
+ BULK_TEST_FILENAME = "bulk_test_sample.csv"
159
+
@st.cache_data
def _fetch_bulk_sample():
    """Download the sample bulk CSV from the dataset repo and parse it.

    ``st.cache_data`` is used instead of ``st.cache_resource`` because the
    result is a mutable DataFrame: each caller receives its own copy, so
    columns added downstream never leak back into the cached value. Errors
    propagate so that a failed download is not cached.
    """
    path = hf_hub_download(
        repo_id="simnid/wellness-tourism-dataset",
        filename=BULK_TEST_FILENAME,
        repo_type="dataset"
    )
    return pd.read_csv(path)


def load_bulk_sample():
    """Return the sample bulk DataFrame, or None if it could not be loaded.

    The warning is issued outside the cached function so that a transient
    failure is retried on the next rerun rather than being remembered as
    ``None``.
    """
    try:
        return _fetch_bulk_sample()
    except Exception as e:
        st.warning(f"Could not load bulk CSV: {e}")
        return None
172
+
# Start from the hosted sample; an uploaded file, when present, replaces it.
bulk_sample = load_bulk_sample()
uploaded_file = st.file_uploader("Upload your CSV for bulk prediction", type=["csv"])
if uploaded_file is not None:
    try:
        bulk_sample = pd.read_csv(uploaded_file)
    except Exception as e:
        # An empty or malformed upload should not crash the whole app, and
        # must not silently fall back to predicting on the sample data.
        st.error(f"Error reading uploaded CSV: {e}")
        bulk_sample = None

if bulk_sample is not None:
    st.write("Bulk data preview:")
    st.dataframe(bulk_sample.head())
    if st.button("Predict Bulk Probabilities"):
        with st.spinner("Predicting..."):
            try:
                preds_proba = model.predict_proba(bulk_sample)
                preds_class = model.predict(bulk_sample)

                # Write results into a copy: the sample frame may be a shared
                # cached object, and mutating it would feed the prediction
                # columns back into the model on the next rerun.
                bulk_results = bulk_sample.copy()
                bulk_results['Probability_Purchase'] = preds_proba[:, 1]
                bulk_results['Prediction'] = preds_class

                st.dataframe(bulk_results)
                csv_bulk = bulk_results.to_csv(index=False).encode('utf-8')
                st.download_button("Download Bulk Predictions", csv_bulk, "bulk_predictions.csv", "text/csv")
            except Exception as e:
                st.error(f"Error predicting bulk data: {e}")
193
+
194
+ # Footer
195
  st.markdown("---")
196
  st.caption("Wellness Tourism Prediction Model | Built with XGBoost & Streamlit")
bulk_data_upload.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import HfApi
2
+ import os
3
+ import pandas as pd
4
+
5
+ # creating bulk test data and saving locally
6
+ # Define sample bulk data
7
+ bulk_data = [
8
+ [35,"Self Enquiry",2,15.0,"Salaried","Male",2,3,"Deluxe",4,"Married",2,1,3.0,1,0,"Manager",15000,45.0],
9
+ [50,"Company Invited",3,30.0,"Large Business","Female",1,1,"Standard",5,"Single",5,1,4.5,0,1,"VP",35000,135.0],
10
+ [28,"Self Enquiry",1,10.0,"Small Business","Male",3,0,"Basic",3,"Unmarried",1,0,2.0,1,2,"Executive",12000,20.0]
11
+ ]
12
+
13
+ columns = [
14
+ 'Age','TypeofContact','CityTier','DurationOfPitch','Occupation','Gender',
15
+ 'NumberOfPersonVisiting','NumberOfFollowups','ProductPitched','PreferredPropertyStar',
16
+ 'MaritalStatus','NumberOfTrips','Passport','PitchSatisfactionScore','OwnCar',
17
+ 'NumberOfChildrenVisiting','Designation','MonthlyIncome','PitchEfficiency'
18
+ ]
19
+
20
+ df_bulk = pd.DataFrame(bulk_data, columns=columns)
21
+
22
+ # Save locally
23
+ local_path = "tourism_project/data/bulk_test_sample.csv"
24
+ df_bulk.to_csv(local_path, index=False)
25
+ print(f"Bulk CSV saved locally at {local_path}")
26
+
27
+ # Get access token from local
28
+ HF_TOKEN = os.getenv("HF_TOKEN")
29
+ if HF_TOKEN:
30
+ HF_TOKEN = HF_TOKEN.strip()
31
+ else:
32
+ raise EnvironmentError("HF_TOKEN not set!")
33
+
34
+ DATA_REPO_ID = "simnid/wellness-tourism-dataset"
35
+ BULK_CSV_PATH = "tourism_project/data/bulk_test_sample.csv"
36
+ BULK_FILENAME = "bulk_test_sample.csv"
37
+
38
+ api = HfApi(token=HF_TOKEN)
39
+
40
+ # Upload CSV
41
+ api.upload_file(
42
+ path_or_fileobj=BULK_CSV_PATH,
43
+ path_in_repo=BULK_FILENAME,
44
+ repo_id=DATA_REPO_ID,
45
+ repo_type="dataset",
46
+ token=HF_TOKEN
47
+ )
48
+
49
+ print(f"Bulk CSV uploaded to Hugging Face dataset repo: {DATA_REPO_ID}/{BULK_FILENAME}")