Spaces:

simnid
/

Wellness-Tourism-Prediction

Sleeping

App Files Files Community

simnid committed on Dec 6, 2025

Commit

7793b7f

verified ·

1 Parent(s): 38b8978

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

app.py +92 -82
bulk_data_upload.py +49 -0

app.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
 from huggingface_hub import hf_hub_download
 import joblib
 # App title and description
 st.set_page_config(
@@ -11,13 +13,13 @@ st.set_page_config(
     layout="wide"
 )
-st.title("🏖️ Wellness Tourism Prediction App")
 st.markdown("""
 This application predicts whether a customer is likely to purchase a wellness tourism package
 based on their demographic, behavioral, and engagement data.
 """)
-# Sidebar for information
 with st.sidebar:
     st.header("About This Model")
     st.markdown("""
@@ -25,98 +27,81 @@ with st.sidebar:
     - Algorithm: XGBoost Classifier (pipeline with preprocessing)
     - Trained on: Wellness Tourism Dataset
     - Target: Product Taken (1 = Purchased, 0 = Not Purchased)
     """)
-    st.subheader("Model Performance (example)")
-    st.metric("ROC AUC", "0.94")
-    st.metric("Precision (Class 1)", "0.69")
-    st.metric("Recall (Class 1)", "0.79")
-# Function to download and load model (pipeline)
 @st.cache_resource
 def load_model():
-    """Load the trained pipeline from Hugging Face Hub"""
     try:
         model_path = hf_hub_download(
-            repo_id="simnid/wellness-tourism-model",
-            filename="best_wellness_tourism_model.joblib",
             repo_type="model"
         )
-        model = joblib.load(model_path)
-        return model
     except Exception as e:
         st.error(f"Error loading model: {e}")
         return None
 model = load_model()
 if model is None:
-    st.warning("Model could not be loaded. Please check your connection.")
     st.stop()
-# Attempt to infer expected input columns from the trained pipeline
-def get_expected_input_columns(model):
-    try:
-        # If ColumnTransformer was used as first step in pipeline with name 'preprocessor'
-        if hasattr(model, "named_steps") and "preprocessor" in model.named_steps:
-            pre = model.named_steps["preprocessor"]
-            cols = []
-            for transformer in pre.transformers_:
-                name, trans, cols_list = transformer
-                # cols_list may be a slice or list
-                if isinstance(cols_list, (list, tuple)):
-                    cols.extend(list(cols_list))
-                else:
-                    try:
-                        cols.extend(list(cols_list))
-                    except Exception:
-                        pass
-            return cols
-    except Exception:
-        pass
-    # Fallback: define expected columns explicitly
-    return [
-        'Age','TypeofContact','CityTier','DurationOfPitch','Occupation','Gender',
-        'NumberOfPersonVisiting','NumberOfFollowups','ProductPitched','PreferredPropertyStar',
-        'MaritalStatus','NumberOfTrips','Passport','PitchSatisfactionScore','OwnCar',
-        'NumberOfChildrenVisiting','Designation','MonthlyIncome','PitchEfficiency'
-    ]
-expected_cols = get_expected_input_columns(model)
-# User input section
 st.header("Customer Information")
 col1, col2, col3 = st.columns(3)
 with col1:
-    Age = st.number_input("Age", min_value=18, max_value=80, value=35, step=1)
     Gender = st.selectbox("Gender", ["Male", "Female"])
     MaritalStatus = st.selectbox("Marital Status", ["Single", "Married", "Divorced", "Unmarried"])
-    NumberOfChildrenVisiting = st.number_input("Number of Children Visiting", min_value=0, max_value=5, value=0, step=1)
     Designation = st.selectbox("Designation", ["Executive", "Manager", "Senior Manager", "AVP", "VP"])
 with col2:
     CityTier = st.selectbox("City Tier", [1, 2, 3])
     PreferredPropertyStar = st.selectbox("Preferred Property Star Rating", [3, 4, 5])
     Passport = st.selectbox("Has Passport", [0, 1], format_func=lambda x: "No" if x == 0 else "Yes")
     OwnCar = st.selectbox("Owns Car", [0, 1], format_func=lambda x: "No" if x == 0 else "Yes")
-    NumberOfTrips = st.number_input("Number of Previous Trips", min_value=0, max_value=20, value=2, step=1)
 with col3:
     TypeofContact = st.selectbox("Type of Contact", ["Self Enquiry", "Company Invited"])
-    DurationOfPitch = st.number_input("Duration of Pitch (minutes)", min_value=0.0, max_value=60.0, value=15.0, step=0.5)
-    NumberOfPersonVisiting = st.number_input("Number of People Visiting", min_value=1, max_value=10, value=2, step=1)
-    NumberOfFollowups = st.number_input("Number of Follow-ups", min_value=0, max_value=10, value=3, step=1)
     ProductPitched = st.selectbox("Product Pitched", ["Basic", "Deluxe", "Standard", "Super Deluxe", "King"])
-    PitchSatisfactionScore = st.slider("Pitch Satisfaction Score", 1, 5, 3)
-# Financial info & derived feature
-PitchEfficiency = DurationOfPitch * PitchSatisfactionScore
-st.metric("Calculated Pitch Efficiency", f"{PitchEfficiency:.2f}")
-Occupation = st.selectbox("Occupation", ["Salaried", "Small Business", "Large Business", "Free Lancer"])
-MonthlyIncome = st.number_input("Monthly Income ($)", min_value=1000, max_value=50000, value=15000, step=500)
-# Assemble input as raw (strings preserved)
-input_row = {
     'Age': Age,
     'TypeofContact': TypeofContact,
     'CityTier': CityTier,
@@ -136,51 +121,76 @@ input_row = {
     'Designation': Designation,
     'MonthlyIncome': MonthlyIncome,
     'PitchEfficiency': PitchEfficiency
-}
-input_data = pd.DataFrame([input_row])
-try:
-    cols_to_use = [c for c in expected_cols if c in input_data.columns]
-    input_data = input_data[cols_to_use]
-except Exception:
-    pass
 with st.expander("View Input Data"):
     st.dataframe(input_data)
-# Prediction
 st.header("Prediction")
 if st.button("Predict Purchase Probability", type="primary", use_container_width=True):
     with st.spinner("Making prediction..."):
         try:
-            # model is a pipeline that includes preprocessing
             prediction_proba = model.predict_proba(input_data)[0]
-            prediction_class = int(model.predict(input_data)[0])
-            prob_purchase = float(prediction_proba[1] * 100)
-            prob_no_purchase = float(prediction_proba[0] * 100)
             col_result1, col_result2 = st.columns(2)
             with col_result1:
                 st.subheader("Prediction Result")
                 if prediction_class == 1:
-                    st.success("**Customer is LIKELY to purchase**")
                     st.balloons()
                 else:
-                    st.info("**Customer is UNLIKELY to purchase**")
             with col_result2:
                 st.subheader("Probability Scores")
-                st.metric("Probability of Purchase", f"{prob_purchase:.1f}%")
-                st.metric("Probability of No Purchase", f"{prob_no_purchase:.1f}%")
-                st.progress(int(min(max(prob_purchase, 0), 100)))
-                st.caption(f"Confidence: {prob_purchase:.1f}%")
-            # Business insights...
         except Exception as e:
             st.error(f"Error making prediction: {e}")
-# Footer
 st.markdown("---")
 st.caption("Wellness Tourism Prediction Model | Built with XGBoost & Streamlit")

+# Importing packages
 import streamlit as st
 import pandas as pd
 import numpy as np
 from huggingface_hub import hf_hub_download
 import joblib
+import io
 # App title and description
 st.set_page_config(
     layout="wide"
 )
+st.title("Wellness Tourism Prediction App")
 st.markdown("""
 This application predicts whether a customer is likely to purchase a wellness tourism package
 based on their demographic, behavioral, and engagement data.
 """)
+# Sidebar
 with st.sidebar:
     st.header("About This Model")
     st.markdown("""
     - Algorithm: XGBoost Classifier (pipeline with preprocessing)
     - Trained on: Wellness Tourism Dataset
     - Target: Product Taken (1 = Purchased, 0 = Not Purchased)
+    **Key Features:**
+    - Handles class imbalance with scale_pos_weight
+    - Uses preprocessing pipeline (scaling + encoding)
+    - Optimized for ROC-AUC score
     """)
+    st.subheader("Model Performance")
+    st.metric("ROC AUC", "0.9683")
+    st.metric("Precision (Class 1)", "0.867")
+    st.metric("Recall (Class 1)", "0.818")
+# Load Model
+MODEL_REPO_ID = "simnid/wellness-tourism-model"
+MODEL_FILENAME = "best_wellness_tourism_model.joblib"
 @st.cache_resource
+def _fetch_model():
+    """Download the pipeline file from the Hub and deserialize it; raises on failure."""
+    model_path = hf_hub_download(
+        repo_id=MODEL_REPO_ID,
+        filename=MODEL_FILENAME,
+        repo_type="model"
+    )
+    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
+    # code; only point MODEL_REPO_ID at a repository you control.
+    return joblib.load(model_path)
+def load_model():
+    """Return the trained pipeline, or None (after showing an error) if loading fails.
+    The cached work lives in _fetch_model() and the try/except lives here, outside
+    the cache. If the except branch ran inside the cached function, its None return
+    value would be cached and a transient download error would keep the app broken
+    until the cache is cleared. Exceptions raised by a cached function are not
+    stored, so the next rerun retries the download.
+    """
+    try:
+        return _fetch_model()
     except Exception as e:
         st.error(f"Error loading model: {e}")
         return None
 model = load_model()
 if model is None:
+    st.warning("Model could not be loaded.")
     st.stop()
+# --- Customer Input ---
 st.header("Customer Information")
 col1, col2, col3 = st.columns(3)
 with col1:
+    st.subheader("Demographics")
+    Age = st.number_input("Age", 18, 80, 35, 1)
     Gender = st.selectbox("Gender", ["Male", "Female"])
     MaritalStatus = st.selectbox("Marital Status", ["Single", "Married", "Divorced", "Unmarried"])
+    NumberOfChildrenVisiting = st.number_input("Number of Children Visiting", 0, 5, 0)
     Designation = st.selectbox("Designation", ["Executive", "Manager", "Senior Manager", "AVP", "VP"])
 with col2:
+    st.subheader("Travel Preferences")
     CityTier = st.selectbox("City Tier", [1, 2, 3])
     PreferredPropertyStar = st.selectbox("Preferred Property Star Rating", [3, 4, 5])
     Passport = st.selectbox("Has Passport", [0, 1], format_func=lambda x: "No" if x == 0 else "Yes")
     OwnCar = st.selectbox("Owns Car", [0, 1], format_func=lambda x: "No" if x == 0 else "Yes")
+    NumberOfTrips = st.number_input("Number of Previous Trips", 0, 20, 2)
 with col3:
+    st.subheader("Engagement Details")
     TypeofContact = st.selectbox("Type of Contact", ["Self Enquiry", "Company Invited"])
+    DurationOfPitch = st.number_input("Duration of Pitch (minutes)", 0.0, 60.0, 15.0, 0.5)
+    NumberOfPersonVisiting = st.number_input("Number of People Visiting", 1, 10, 2)
+    NumberOfFollowups = st.number_input("Number of Follow-ups", 0, 10, 3)
     ProductPitched = st.selectbox("Product Pitched", ["Basic", "Deluxe", "Standard", "Super Deluxe", "King"])
+    PitchSatisfactionScore = st.slider("Pitch Satisfaction Score", 0.0, 5.0, 3.0, 0.1)
+# Financial Information
+st.subheader("Financial Information")
+col4, col5 = st.columns(2)
+with col4:
+    Occupation = st.selectbox("Occupation", ["Salaried", "Small Business", "Large Business", "Free Lancer"])
+    MonthlyIncome = st.number_input("Monthly Income ($)", 1000, 1000000, 15000, 500)
+with col5:
+    PitchEfficiency = DurationOfPitch * PitchSatisfactionScore
+    st.metric("Calculated Pitch Efficiency", f"{PitchEfficiency:.2f}")
+# Assemble Input
+input_data = pd.DataFrame([{
     'Age': Age,
     'TypeofContact': TypeofContact,
     'CityTier': CityTier,
     'Designation': Designation,
     'MonthlyIncome': MonthlyIncome,
     'PitchEfficiency': PitchEfficiency
+}])
 with st.expander("View Input Data"):
     st.dataframe(input_data)
+    csv = input_data.to_csv(index=False).encode('utf-8')
+    st.download_button("Download Input Data", csv, "input_data.csv", "text/csv")
+# Prediction
 st.header("Prediction")
 if st.button("Predict Purchase Probability", type="primary", use_container_width=True):
     with st.spinner("Making prediction..."):
         try:
             prediction_proba = model.predict_proba(input_data)[0]
+            prediction_class = model.predict(input_data)[0]
             col_result1, col_result2 = st.columns(2)
             with col_result1:
                 st.subheader("Prediction Result")
                 if prediction_class == 1:
+                    st.success("Customer is LIKELY to purchase")
                     st.balloons()
                 else:
+                    st.info("Customer is UNLIKELY to purchase")
             with col_result2:
                 st.subheader("Probability Scores")
+                st.metric("Probability of Purchase", f"{prediction_proba[1]*100:.1f}%")
+                st.metric("Probability of No Purchase", f"{prediction_proba[0]*100:.1f}%")
+                st.progress(int(prediction_proba[1]*100))
         except Exception as e:
             st.error(f"Error making prediction: {e}")
+# Bulk CSV Prediction
+st.header("Bulk CSV Prediction")
+BULK_TEST_FILENAME = "bulk_test_sample.csv"
+# st.cache_data (not st.cache_resource) because the result is a DataFrame:
+# cache_data hands each caller its own copy, so later column assignments on the
+# returned frame cannot leak into the cached value shared across reruns/sessions.
+@st.cache_data
+def _fetch_bulk_sample():
+    """Download the sample CSV from the dataset repo and parse it; raises on failure."""
+    path = hf_hub_download(
+        repo_id="simnid/wellness-tourism-dataset",
+        filename=BULK_TEST_FILENAME,
+        repo_type="dataset"
+    )
+    return pd.read_csv(path)
+def load_bulk_sample():
+    """Return the bundled bulk-test sample as a DataFrame, or None if it cannot be loaded.
+    The try/except is kept outside the cached helper so a failed download is
+    reported with a warning but never cached; the next rerun retries it.
+    """
+    try:
+        return _fetch_bulk_sample()
+    except Exception as e:
+        st.warning(f"Could not load bulk CSV: {e}")
+        return None
+# Start from the bundled sample; an uploaded file, when present, replaces it.
+bulk_sample = load_bulk_sample()
+uploaded_file = st.file_uploader("Upload your CSV for bulk prediction", type=["csv"])
+if uploaded_file is not None:
+    try:
+        bulk_sample = pd.read_csv(uploaded_file)
+    except Exception as e:
+        # A malformed or empty upload should produce a message, not a traceback,
+        # and must not silently fall back to the sample the user did not choose.
+        st.error(f"Could not read uploaded CSV: {e}")
+        bulk_sample = None
+if bulk_sample is not None:
+    st.write("Bulk data preview:")
+    st.dataframe(bulk_sample.head())
+    if st.button("Predict Bulk Probabilities"):
+        with st.spinner("Predicting..."):
+            try:
+                preds_proba = model.predict_proba(bulk_sample)
+                preds_class = model.predict(bulk_sample)
+                # Attach the outputs to a copy so the input frame (which may be
+                # the object returned by load_bulk_sample()) is never mutated.
+                bulk_results = bulk_sample.copy()
+                bulk_results['Probability_Purchase'] = preds_proba[:, 1]
+                bulk_results['Prediction'] = preds_class
+                st.dataframe(bulk_results)
+                csv_bulk = bulk_results.to_csv(index=False).encode('utf-8')
+                st.download_button("Download Bulk Predictions", csv_bulk, "bulk_predictions.csv", "text/csv")
+            except Exception as e:
+                st.error(f"Error predicting bulk data: {e}")
+# Footer
 st.markdown("---")
 st.caption("Wellness Tourism Prediction Model | Built with XGBoost & Streamlit")

bulk_data_upload.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from huggingface_hub import HfApi
+import os
+import pandas as pd
+# Build a small sample CSV for bulk prediction, save it locally, then upload it
+# to the Hugging Face dataset repo.
+# Each row follows the order of `columns` below; the last value (PitchEfficiency)
+# equals DurationOfPitch * PitchSatisfactionScore for that row.
+bulk_data = [
+    [35,"Self Enquiry",2,15.0,"Salaried","Male",2,3,"Deluxe",4,"Married",2,1,3.0,1,0,"Manager",15000,45.0],
+    [50,"Company Invited",3,30.0,"Large Business","Female",1,1,"Standard",5,"Single",5,1,4.5,0,1,"VP",35000,135.0],
+    [28,"Self Enquiry",1,10.0,"Small Business","Male",3,0,"Basic",3,"Unmarried",1,0,2.0,1,2,"Executive",12000,20.0]
+]
+columns = [
+    'Age','TypeofContact','CityTier','DurationOfPitch','Occupation','Gender',
+    'NumberOfPersonVisiting','NumberOfFollowups','ProductPitched','PreferredPropertyStar',
+    'MaritalStatus','NumberOfTrips','Passport','PitchSatisfactionScore','OwnCar',
+    'NumberOfChildrenVisiting','Designation','MonthlyIncome','PitchEfficiency'
+]
+df_bulk = pd.DataFrame(bulk_data, columns=columns)
+# Single source of truth for the local file location (used for save and upload).
+BULK_CSV_PATH = "tourism_project/data/bulk_test_sample.csv"
+local_path = BULK_CSV_PATH
+# Create the target directory first; to_csv does not create missing parents.
+os.makedirs(os.path.dirname(local_path), exist_ok=True)
+df_bulk.to_csv(local_path, index=False)
+print(f"Bulk CSV saved locally at {local_path}")
+# Read the access token from the environment. Strip before testing so that a
+# whitespace-only value is rejected here instead of failing later at upload.
+HF_TOKEN = (os.getenv("HF_TOKEN") or "").strip()
+if not HF_TOKEN:
+    raise EnvironmentError("HF_TOKEN not set!")
+DATA_REPO_ID = "simnid/wellness-tourism-dataset"
+BULK_FILENAME = "bulk_test_sample.csv"
+api = HfApi(token=HF_TOKEN)
+# Upload CSV
+api.upload_file(
+    path_or_fileobj=BULK_CSV_PATH,
+    path_in_repo=BULK_FILENAME,
+    repo_id=DATA_REPO_ID,
+    repo_type="dataset",
+    token=HF_TOKEN
+)
+print(f"Bulk CSV uploaded to Hugging Face dataset repo: {DATA_REPO_ID}/{BULK_FILENAME}")