# Uploaded as app.py with huggingface_hub by LalithaShiva (commit fe84eaa, verified).
import pandas as pd
import joblib
import os
import streamlit as st
from huggingface_hub import hf_hub_download
from sklearn.preprocessing import LabelEncoder
# Path to the serialized model inside the Docker container; the Dockerfile
# copies this file next to the app.
MODEL_PATH = "best_random_forest_model.joblib"


@st.cache_resource  # Keep a single deserialized model per server process
def _load_model_from_disk(path):
    """Deserialize and return the model stored at *path*.

    Errors are deliberately left to propagate: only a returned value is
    cached, so a failed load is attempted again on the next rerun instead of
    a cached ``None`` sticking around for the life of the process.
    """
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only ship model files produced by a trusted build.
    return joblib.load(path)


def load_model():
    """Return the model stored at ``MODEL_PATH``, or ``None`` if loading fails.

    The outcome is reported in the UI through ``st.success`` / ``st.error``.
    """
    try:
        model = _load_model_from_disk(MODEL_PATH)
    except Exception as e:  # UI boundary: report any load failure, don't crash
        st.error(f"Error loading model: {e}")
        return None
    st.success("Model loaded successfully!")
    return model
# load_model() returns None when loading fails, so only log success when a
# model object actually came back.
model = load_model()
if model is not None:
    print("Model Loaded successfully")
# Streamlit UI for Tourism Sales Prediction
_APP_INTRO = """
This application predicts whether a customer will take a tourism package based on their details.
Please enter the customer details below to get a prediction.
"""

# Choices offered by the select boxes below.
_CITY_TIERS = [1, 2, 3]
_PROPERTY_STARS = [1.0, 2.0, 3.0, 4.0, 5.0]
_BINARY_FLAGS = [0, 1]
_CONTACT_TYPES = ["Self Enquiry", "Company Invited"]
_OCCUPATIONS = ["Salaried", "Small Business", "Large Business", "Free Lancer", "Other", "Housewife"]
_GENDERS = ["Male", "Female", "Fe Male"]  # Note: "Fe Male" was in the original data
_MARITAL_STATUSES = ["Married", "Single", "Divorced", "Unmarried"]
_DESIGNATIONS = ["Executive", "Manager", "Senior Manager", "AVP", "VP"]
_PRODUCTS = ["Basic", "Deluxe", "Standard", "Super Deluxe", "King"]


def _yes_no(flag):
    """Display a 0/1 option as "No"/"Yes"."""
    if flag == 1:
        return "Yes"
    return "No"


st.title("Tourism Sales Prediction App")
st.write(_APP_INTRO)

# Numeric and flag inputs, one widget per tourism dataset column
st.header("Customer Details")
age = st.number_input("Age", min_value=0, max_value=120, value=30)
city_tier = st.selectbox("City Tier", _CITY_TIERS)
duration_of_pitch = st.number_input(
    "Duration of Pitch (minutes)", min_value=0.0, value=10.0
)
number_of_person_visiting = st.number_input(
    "NumberOf Persons Visiting", min_value=1, value=2
)
number_of_followups = st.number_input("NumberOf Followups", min_value=0, value=3)
preferred_property_star = st.selectbox("Preferred Property Star", _PROPERTY_STARS)
number_of_trips = st.number_input("Number of Trips", min_value=0, value=1)
passport = st.selectbox("Passport", _BINARY_FLAGS, format_func=_yes_no)
pitch_satisfaction_score = st.slider(
    "Pitch Satisfaction Score", min_value=1, max_value=5, value=3
)
own_car = st.selectbox("Own Car", _BINARY_FLAGS, format_func=_yes_no)
number_of_children_visiting = st.number_input(
    "NumberOf Children Visiting", min_value=0, value=0
)
monthly_income = st.number_input("Monthly Income", min_value=0.0, value=25000.0)

# Categorical inputs, collected as their raw string labels
typeofcontact = st.selectbox("Type of Contact", _CONTACT_TYPES)
occupation = st.selectbox("Occupation", _OCCUPATIONS)
gender = st.selectbox("Gender", _GENDERS)
marital_status = st.selectbox("Marital Status", _MARITAL_STATUSES)
designation = st.selectbox("Designation", _DESIGNATIONS)
product_pitched = st.selectbox("Product Pitched", _PRODUCTS)
# Collect the widget values into a single-row DataFrame keyed by the
# dataset's column names.
input_data = dict(
    Age=age,
    TypeofContact=typeofcontact,
    CityTier=city_tier,
    DurationOfPitch=duration_of_pitch,
    Occupation=occupation,
    Gender=gender,
    NumberOfPersonVisiting=number_of_person_visiting,
    NumberOfFollowups=number_of_followups,
    ProductPitched=product_pitched,
    PreferredPropertyStar=preferred_property_star,
    MaritalStatus=marital_status,
    NumberOfTrips=number_of_trips,
    Passport=passport,
    PitchSatisfactionScore=pitch_satisfaction_score,
    OwnCar=own_car,
    NumberOfChildrenVisiting=number_of_children_visiting,
    Designation=designation,
    MonthlyIncome=monthly_income,
)
input_df = pd.DataFrame([input_data])
# --- Preprocessing the input data ---
# Training is assumed to have label-encoded the categorical columns, so the
# string values collected from the widgets are converted to integer codes
# here before they reach the model.
#
# NOTE: the encoders fitted during training are not loaded; they are rebuilt
# from the category lists below (the same labels the select boxes offer).
# LabelEncoder numbers labels in sorted order, so the codes match training
# only if training saw exactly these labels.
# TODO: save the fitted encoders at training time and load them here.
CATEGORY_LEVELS = {
    'TypeofContact': ["Self Enquiry", "Company Invited"],
    'Occupation': ["Salaried", "Small Business", "Large Business", "Free Lancer", "Other", "Housewife"],
    'Gender': ["Male", "Female", "Fe Male"],
    'MaritalStatus': ["Married", "Single", "Divorced", "Unmarried"],
    'Designation': ["Executive", "Manager", "Senior Manager", "AVP", "VP"],
    'ProductPitched': ["Basic", "Deluxe", "Standard", "Super Deluxe", "King"],
}
categorical_cols = list(CATEGORY_LEVELS)

for col in categorical_cols:
    if col not in input_df.columns:
        continue

    # Fill missing categorical values with the most frequent one. The result
    # is assigned back to the frame: calling fillna(inplace=True) on
    # input_df[col] is chained assignment, which pandas warns about and which
    # leaves the frame unchanged once Copy-on-Write is active.
    modes = input_df[col].mode()
    if not modes.empty:  # mode() is empty when every value is missing
        input_df[col] = input_df[col].fillna(modes.iloc[0])

    try:
        le = LabelEncoder()
        le.fit(CATEGORY_LEVELS[col])
        input_df[col] = le.transform(input_df[col])
    except Exception as e:  # e.g. a label outside CATEGORY_LEVELS[col]
        st.error(f"Error during Label Encoding for column {col}: {e}")
        # Halt this run rather than pass an un-encoded string column on to
        # the model.
        st.stop()

# Fill missing values in the remaining (non-object) columns with the median.
for col in input_df.columns:
    if input_df[col].dtype != 'object':
        input_df[col] = input_df[col].fillna(input_df[col].median())
# The model needs the same feature names, in the same order, as the training
# features (X_train). Ideally X_train.columns would be saved at training time
# and loaded here; for now the order below is assumed to match the original
# training data.
expected_columns = [
    'Age',
    'TypeofContact',
    'CityTier',
    'DurationOfPitch',
    'Occupation',
    'Gender',
    'NumberOfPersonVisiting',
    'NumberOfFollowups',
    'ProductPitched',
    'PreferredPropertyStar',
    'MaritalStatus',
    'NumberOfTrips',
    'Passport',
    'PitchSatisfactionScore',
    'OwnCar',
    'NumberOfChildrenVisiting',
    'Designation',
    'MonthlyIncome',
]

# Columns present on one side only; empty exactly when the two sets are equal.
mismatched_columns = set(input_df.columns) ^ set(expected_columns)
if mismatched_columns:
    st.error("Input features do not match expected features. Please check preprocessing.")
    st.stop()
else:
    # Same columns, so only the order has to be enforced.
    input_df = input_df.loc[:, expected_columns]
# --- Prediction ---
# Runs only on the rerun triggered by pressing the button.
if st.button("Predict Tourism Package Purchase"):
    if model is None or input_df is None:
        st.warning("Model not loaded or input data not processed correctly. Cannot make prediction.")
    else:
        try:
            # input_df was put into expected_columns order earlier, as the
            # scikit-learn model requires.
            prediction = model.predict(input_df)[0]
            if prediction == 1:
                result = "Customer will likely purchase a tourism package"
            else:
                result = "Customer will likely NOT purchase a tourism package"
            st.subheader("Prediction Result:")
            st.success(f"The model predicts: **{result}**")
        except Exception as err:
            st.error(f"Error during prediction: {err}")
            # Show the full traceback as well to help with debugging.
            import traceback
            st.error(traceback.format_exc())