# hbcp / app.py
# Uploaded by omm7 using huggingface_hub (commit b7d40ec, verified), 8.36 kB
import streamlit as st
import joblib
import pandas as pd
import numpy as np
import os
import time
# --- Constants and Configuration ---
# Bare file name of the serialized model; it is handed unchanged to joblib.load.
MODEL_FILE = 'hotel_cancellation_prediction_model_v1_0.joblib'

# Feature (column) names in the exact order the model expects.
# The order was corrected to match the X_train columns seen while debugging;
# the single-row prediction DataFrame is built with precisely these columns.
EXPECTED_FEATURES = [
    'lead_time',
    'no_of_special_requests',
    'avg_price_per_room',
    'no_of_adults',
    'no_of_weekend_nights',
    'required_car_parking_space',
    'no_of_week_nights',
    'arrival_month',
    'market_segment_type_Online',
]
# Input-widget bounds and defaults, based on the EDA summary (adjust as needed).
# Figures from data.describe().T:
#   feature                      min   max    mean   75%
#   lead_time                      0   443   81.61   118
#   no_of_special_requests         0     5    0.56     1
#   avg_price_per_room             0   540  101.03   119
#   no_of_adults                   0     4    1.83     2
#   no_of_weekend_nights           0     6    0.78     1
#   no_of_week_nights              0    17    2.17     3
#   required_car_parking_space     0     1
#   arrival_month                  1    12   (based on code)
# Integer entries stay int and price entries stay float: the values are passed
# straight through as the min/max/default of the input widgets.
FEATURE_RANGES = {
    'lead_time': dict(min=0, max=450, default=82),
    'no_of_special_requests': dict(min=0, max=5, default=1),
    'avg_price_per_room': dict(min=0.0, max=600.0, default=101.0),
    'no_of_adults': dict(min=0, max=4, default=2),
    # Upper bounds for the two night counts sit slightly above the observed maxima.
    'no_of_weekend_nights': dict(min=0, max=7, default=1),
    'no_of_week_nights': dict(min=0, max=20, default=2),
    # Default month is July, picked from the EDA as an example.
    'arrival_month': dict(min=1, max=12, default=7),
}
# --- Model Loading (Cached) ---
@st.cache_resource
def _load_model_from_disk():
    """Deserialize the model stored at MODEL_FILE and cache the result.

    Any failure is allowed to propagate as an exception. Only a value that is
    actually returned gets cached, so a failed load is retried on the next
    script run instead of being remembered as a permanent failure.
    """
    return joblib.load(MODEL_FILE)


def load_cancellation_model():
    """Return the cancellation model, or None if it could not be loaded.

    On failure the error is shown in the app via st.error and None is
    returned, so the caller can stop the app gracefully.

    Returns:
        The object produced by joblib.load(MODEL_FILE), or None on any error.
    """
    try:
        return _load_model_from_disk()
    except Exception as exc:  # UI boundary: report every load failure, never crash here
        # NOTE(review): this runs at module level, before st.set_page_config()
        # further down the file. Streamlit releases that require
        # set_page_config to be the first Streamlit command may raise when an
        # element is emitted here first - confirm against the deployed version.
        st.error(f"Error loading model: {exc}")
        return None


cancellation_predictor = load_cancellation_model()
# --- Prediction Function (Critical Data Preprocessing) ---
def run_prediction(
    lead_time, market_segment_type, avg_price_per_room, no_of_adults,
    no_of_weekend_nights, no_of_week_nights, no_of_special_requests,
    arrival_month, required_car_parking_space, model
):
    """Encode one booking as a single-row DataFrame and score it with `model`.

    The two categorical inputs are turned into 0.0/1.0 flags, mimicking the
    feature engineering used for training:
      * market_segment_type -> 'market_segment_type_Online' (1.0 only for 'Online')
      * required_car_parking_space -> 1.0 only for "Yes"
    All other arguments are passed through unchanged.

    Args:
        model: object exposing predict() and predict_proba(); both are called
            with the one-row DataFrame.

    Returns:
        Tuple (prediction, prob_cancellation, prob_kept): the first predicted
        label, then entries 1 and 0 of the first predict_proba row. Entry 1 is
        treated as the probability of cancellation.
    """
    # Flags for the categorical inputs (the code treats 'Offline' as the base category).
    online_flag = 1.0 if market_segment_type == 'Online' else 0.0
    parking_flag = 1.0 if required_car_parking_space == "Yes" else 0.0

    booking = {
        'lead_time': lead_time,
        'no_of_special_requests': no_of_special_requests,
        'avg_price_per_room': avg_price_per_room,
        'no_of_adults': no_of_adults,
        'no_of_weekend_nights': no_of_weekend_nights,
        'no_of_week_nights': no_of_week_nights,
        'arrival_month': arrival_month,
        'market_segment_type_Online': online_flag,
        'required_car_parking_space': parking_flag,
    }

    # `columns=` fixes the column order to EXPECTED_FEATURES regardless of the
    # key order used in the dict above.
    features = pd.DataFrame([booking], columns=EXPECTED_FEATURES)

    # The parking flag is explicitly cast to float64, the dtype the model expects.
    parking_column = features['required_car_parking_space']
    features['required_car_parking_space'] = parking_column.astype('float64')

    predicted_label = model.predict(features)[0]
    # Row layout: [Prob_Not_Canceled (0), Prob_Canceled (1)]
    class_probs = model.predict_proba(features)[0]
    return predicted_label, class_probs[1], class_probs[0]
# --- Streamlit UI ---
st.set_page_config(
    page_title="Hotel Cancellation Predictor",
    layout="centered",
    initial_sidebar_state="expanded",
)

st.title("🛎️ INN Hotels: Booking Cancellation Predictor")
st.markdown("Use the controls below to input booking details and predict the cancellation risk.")

# Nothing below can work without a model, so end this script run here.
if cancellation_predictor is None:
    st.warning("Application stopped due to critical error in model loading.")
    st.stop()

# --- Input Fields (arranged for better dashboard look) ---
# The inputs are split over two columns; the number in each label is the
# on-screen ordering of the nine inputs.
col1, col2 = st.columns(2)

with col1:
    lead_time = st.slider(
        "1. Lead Time (Days before arrival)",
        min_value=FEATURE_RANGES['lead_time']['min'],
        max_value=FEATURE_RANGES['lead_time']['max'],
        value=FEATURE_RANGES['lead_time']['default'],
    )
    no_of_adults = st.number_input(
        "4. Number of Adults",
        min_value=FEATURE_RANGES['no_of_adults']['min'],
        max_value=FEATURE_RANGES['no_of_adults']['max'],
        value=FEATURE_RANGES['no_of_adults']['default'],
        step=1,
    )
    no_of_week_nights = st.slider(
        "6. Number of Week Nights",
        min_value=FEATURE_RANGES['no_of_week_nights']['min'],
        max_value=FEATURE_RANGES['no_of_week_nights']['max'],
        value=FEATURE_RANGES['no_of_week_nights']['default'],
    )
    month_options = list(range(
        FEATURE_RANGES['arrival_month']['min'],
        FEATURE_RANGES['arrival_month']['max'] + 1,
    ))
    arrival_month = st.selectbox(
        "8. Arrival Month (1=Jan to 12=Dec)",
        month_options,
        # Look the default up in the options rather than using `default - 1`,
        # which is only correct while the configured range starts at 1.
        index=month_options.index(FEATURE_RANGES['arrival_month']['default']),
    )

with col2:
    market_segment_type = st.selectbox("2. Market Segment Type", ["Online", "Offline"], index=0)
    avg_price_per_room = st.number_input(
        "3. Average Price per Room ($)",
        min_value=FEATURE_RANGES['avg_price_per_room']['min'],
        max_value=FEATURE_RANGES['avg_price_per_room']['max'],
        value=FEATURE_RANGES['avg_price_per_room']['default'],
        format="%.2f",
    )
    no_of_weekend_nights = st.slider(
        "5. Number of Weekend Nights",
        min_value=FEATURE_RANGES['no_of_weekend_nights']['min'],
        max_value=FEATURE_RANGES['no_of_weekend_nights']['max'],
        value=FEATURE_RANGES['no_of_weekend_nights']['default'],
    )
    no_of_special_requests = st.number_input(
        "7. Number of Special Requests",
        min_value=FEATURE_RANGES['no_of_special_requests']['min'],
        max_value=FEATURE_RANGES['no_of_special_requests']['max'],
        value=FEATURE_RANGES['no_of_special_requests']['default'],
        step=1,
    )
    # index=1 preselects "No".
    required_car_parking_space = st.selectbox("9. Required Car Parking Space", ["Yes", "No"], index=1)

# --- Prediction Button ---
st.markdown("---")  # Add a separator
if st.button("PREDICT CANCELLATION RISK", type="primary"):
    # Simple progress indicator for UX: the short sleep keeps the spinner
    # visible even when the prediction itself returns immediately.
    with st.spinner('Analyzing booking data...'):
        time.sleep(0.5)
        prediction, prob_cancellation, prob_kept = run_prediction(
            lead_time, market_segment_type, avg_price_per_room, no_of_adults,
            no_of_weekend_nights, no_of_week_nights, no_of_special_requests,
            arrival_month, required_car_parking_space, cancellation_predictor
        )

    st.markdown("---")
    st.subheader("Prediction Result")

    # A predicted label of 1 is reported as a cancellation; anything else as kept.
    if prediction == 1:
        st.error("**High Risk of Cancellation:** The model predicts the booking will be **CANCELLED**.")
    else:
        st.success("**Low Risk:** The model predicts the booking will be **KEPT**.")

    st.markdown(f"**Likelihood of Cancellation: {prob_cancellation*100:.2f}%**")
    st.markdown(f"Likelihood of Keeping Booking: {prob_kept*100:.2f}%")

    # The follow-up hint is shown only for predicted cancellations whose
    # probability exceeds 0.70.
    if prediction == 1 and prob_cancellation > 0.70:
        st.info("💡 **Actionable Insight:** Consider proactively contacting this guest or flagging the room for immediate re-marketing.")