# Siep3 — src/streamlit_app.py
# Author: Pegumenezes (last update commit 652ba8c, verified)
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Page-level Streamlit configuration: use the full browser width and give the
# dashboard its title. Must run before any other Streamlit output calls.
st.set_page_config(layout="wide")
st.title("Hotel Booking Cancellation Prediction Dashboard")
@st.cache_data
def load_data():
    """Download the hotel-bookings CSV from GitHub and apply initial cleaning.

    Returns the cleaned DataFrame, or None when the download fails (an error
    banner is shown in that case).
    """
    # Direct "raw" URL for the CSV — pandas needs the raw file, not the
    # GitHub HTML page.
    data_url = "https://raw.githubusercontent.com/pegumzs/Siep/main/hotel_bookings.csv"

    # Read straight from the URL; surface any connection/parse problem in the UI.
    try:
        bookings = pd.read_csv(data_url)
    except Exception as e:
        st.error(f"Não foi possível carregar os dados do GitHub. Erro: {e}")
        return None

    # Columns removed up front, for these reasons:
    #  - 'company' and 'agent' are mostly null;
    #  - 'reservation_status' / 'reservation_status_date' are data leakage —
    #    they encode the answer the model is supposed to predict;
    #  - the remaining date/room columns are left out to keep this phase of
    #    the model simple.
    drop_cols = [
        "company", "agent", "reservation_status_date", "arrival_date_week_number",
        "arrival_date_year", "assigned_room_type", "reserved_room_type",
        "days_in_waiting_list", "reservation_status", "country",
    ]
    bookings = bookings.drop(columns=drop_cols, errors="ignore")

    # Deliberately strict cleaning: keep only 100%-complete rows.
    bookings = bookings.dropna()

    # 'adr' (average daily rate) can contain non-numeric values; coerce to
    # numbers and discard rows where the conversion fails.
    bookings["adr"] = pd.to_numeric(bookings["adr"], errors="coerce")
    bookings = bookings.dropna(subset=["adr"])

    return bookings
def train_model(df):
    """Fit a statsmodels logistic regression on RFE-selected, SMOTE-balanced data.

    Pipeline: one-hot encode categoricals -> RFE down to 15 predictors ->
    70/30 train/test split -> SMOTE oversampling of the training split ->
    statsmodels Logit fit (so we get p-values and a summary).

    Parameters
    ----------
    df : pd.DataFrame
        Cleaned bookings data containing the feature columns and 'is_canceled'.

    Returns
    -------
    tuple
        (fit results, test design matrix, test target, full RFE feature
        matrix, full target, Index of RFE-selected feature names).
    """
    # Candidate features: continuous kept as-is, categoricals one-hot encoded
    # (drop_first avoids the dummy-variable trap).
    continuous_features = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
                           'booking_changes', 'adr', 'total_of_special_requests']
    categorical_features = ['hotel', 'market_segment', 'deposit_type',
                            'customer_type', 'arrival_date_month']
    selected_features = continuous_features + categorical_features
    df_processed = pd.get_dummies(df[selected_features], columns=categorical_features, drop_first=True)
    df_processed['is_canceled'] = df['is_canceled']
    X = df_processed.drop('is_canceled', axis=1)
    y = df_processed['is_canceled']

    # Recursive feature elimination down to 15 predictors.
    model_rfe_estimator = LogisticRegression(solver='liblinear', random_state=42, max_iter=1000)
    rfe = RFE(estimator=model_rfe_estimator, n_features_to_select=15)
    rfe.fit(X, y)
    selected_rfe_features = X.columns[rfe.support_]
    X_rfe = X[selected_rfe_features]

    # Hold out 30% for evaluation, then balance ONLY the training split with
    # SMOTE (the test split must keep the real class distribution).
    X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = train_test_split(
        X_rfe, y, test_size=0.3, random_state=42)
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_rfe, y_train_rfe)

    # statsmodels needs an explicit intercept column.
    X_train_resampled_sm = sm.add_constant(X_train_resampled)
    X_test_rfe_sm = sm.add_constant(X_test_rfe)

    # Coerce every column to numeric (get_dummies can emit bool columns) and
    # drop any column that failed conversion entirely.
    for col in X_train_resampled_sm.columns:
        X_train_resampled_sm[col] = pd.to_numeric(X_train_resampled_sm[col], errors='coerce')
    for col in X_test_rfe_sm.columns:
        X_test_rfe_sm[col] = pd.to_numeric(X_test_rfe_sm[col], errors='coerce')
    X_train_resampled_sm.dropna(axis=1, inplace=True)
    X_test_rfe_sm.dropna(axis=1, inplace=True)

    # BUGFIX: the original intersected the column sets via set(), whose
    # iteration order over strings is hash-randomized across interpreter
    # runs, so the design-matrix column order (and therefore the coefficient
    # order in every summary/table downstream) was nondeterministic.
    # An order-preserving intersection keeps train and test aligned AND
    # reproducible, making the separate reorder step unnecessary.
    common_cols = [c for c in X_train_resampled_sm.columns if c in X_test_rfe_sm.columns]
    X_train_resampled_sm = X_train_resampled_sm[common_cols]
    X_test_rfe_sm = X_test_rfe_sm[common_cols]

    # Fit with statsmodels so the app can report p-values and odds ratios.
    logit_model = sm.Logit(y_train_resampled, X_train_resampled_sm)
    result = logit_model.fit()

    return result, X_test_rfe_sm, y_test_rfe, X_rfe, y, selected_rfe_features
# Main application logic: load the data, then fit the model once per session.
df = load_data()
# BUGFIX: load_data() returns None when the GitHub download fails; the
# original passed that None straight into train_model and crashed with an
# AttributeError on top of the error banner. Halt the app cleanly instead.
if df is None:
    st.stop()
result, X_test_rfe_sm, y_test_rfe, X_rfe, y, selected_rfe_features = train_model(df)
# --- Display Model Summary and Coefficients ---
st.header("Model Interpretation: Coefficients, Odds Ratios, and P-values")
# One row per fitted term: raw log-odds, the exponentiated odds ratio
# (easier to interpret), and the p-value; sorted by odds ratio, largest first.
coefficients = (
    pd.DataFrame(
        {
            'Feature': result.params.index,
            'Log-Odds': result.params.values,
            'Odds Ratio': np.exp(result.params.values),
            'P-value': result.pvalues.values,
        }
    )
    .sort_values(by='Odds Ratio', ascending=False)
)
st.dataframe(coefficients)
# --- Multicollinearity Check (VIF) ---
st.header("Multicollinearity Check (VIF)")
# Variance inflation factor per column of the test design matrix; high VIF
# flags predictors that are near-linear combinations of the others.
design = X_test_rfe_sm.values
vif_data = pd.DataFrame({
    "feature": X_test_rfe_sm.columns,
    "VIF": [variance_inflation_factor(design, i) for i in range(design.shape[1])],
}).sort_values(by="VIF", ascending=False)
st.dataframe(vif_data)
# --- Model Evaluation ---
st.header("Model Evaluation")
# Predicted probabilities from the fitted logit, thresholded at 0.5 for
# hard class labels.
y_pred_smote_prob = result.predict(X_test_rfe_sm)
y_pred_smote = (y_pred_smote_prob >= 0.5).astype(int)

st.subheader("Accuracy")
st.write(f"Model Accuracy: {accuracy_score(y_test_rfe, y_pred_smote):.4f}")

st.subheader("Confusion Matrix")
st.write(confusion_matrix(y_test_rfe, y_pred_smote))

st.subheader("Classification Report")
st.text(classification_report(y_test_rfe, y_pred_smote))
# --- ROC Curve and AUC ---
st.header("ROC Curve and AUC")
# ROC is computed from the raw probabilities (not the 0.5-thresholded labels)
# so the full operating range is shown.
fpr, tpr, thresholds = roc_curve(y_test_rfe, y_pred_smote_prob)
roc_auc = auc(fpr, tpr)

fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier, for reference.
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax_roc.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic (ROC) Curve',
)
ax_roc.legend(loc="lower right")
st.pyplot(fig_roc)

# --- Logistic Curves for Selected Continuous Variables ---
st.header("Logistic Curves for Selected Continuous Variables")
def plot_logistic_curve(feature_name, model_results, X_data, y_data, selected_features):
    """Plot predicted cancellation probability against one continuous feature.

    Every other model feature is held at its mean while `feature_name`
    sweeps its observed range; the fitted model supplies the probabilities,
    overlaid on a scatter of the actual 0/1 outcomes.

    Parameters
    ----------
    feature_name : str
        Column of `X_data` to sweep.
    model_results : statsmodels results object
        Fitted Logit results (provides `.params` and `.predict`).
    X_data, y_data : feature matrix and binary target used for the scatter.
    selected_features : accepted for interface compatibility; not used here.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x=X_data[feature_name], y=y_data, alpha=0.1,
                    label='Actual Cancellation (0/1)', ax=ax)

    # Sweep the feature across its observed range.
    x_range = np.linspace(X_data[feature_name].min(), X_data[feature_name].max(), 500)

    # Hold every other feature at its mean. BUGFIX: the original duplicated
    # this construction in two near-identical if/else branches, and in the
    # 'const' branch added the intercept column, dropped it again during the
    # reorder, and re-added it — pure redundant work. This single path builds
    # the same final design matrix.
    other_cols = X_data.drop(columns=[feature_name], errors='ignore')
    predictions_df = pd.DataFrame([other_cols.mean().to_dict()] * len(x_range),
                                  columns=other_cols.columns)
    predictions_df[feature_name] = x_range

    # Reorder to match the fitted model exactly — statsmodels' predict() is
    # positional over the design matrix — then prepend the intercept column
    # only if the model was fit with one.
    predictions_df = predictions_df[model_results.params.index.drop('const', errors='ignore')]
    if 'const' in model_results.params.index:
        predictions_df = sm.add_constant(predictions_df, prepend=True)

    y_proba = model_results.predict(predictions_df)
    ax.plot(x_range, y_proba, color='red', lw=2, label='Predicted Probability of Cancellation')
    ax.set_title(f'Logistic Curve for {feature_name}')
    ax.set_xlabel(feature_name)
    ax.set_ylabel('Probability of Cancellation')
    ax.legend()
    ax.grid(True)
    st.pyplot(fig)
# Restrict to the continuous predictors that survived RFE selection.
continuous_features_for_plot = [
    f for f in selected_rfe_features
    if f in ['lead_time', 'adr', 'booking_changes', 'stays_in_week_nights',
             'total_of_special_requests', 'stays_in_weekend_nights']
]
# Pick up to three features to plot, in a fixed priority order (same order
# the original if-chain implemented).
_priority = ['lead_time', 'adr', 'booking_changes',
             'total_of_special_requests', 'stays_in_week_nights']
plot_features = [f for f in _priority if f in continuous_features_for_plot][:3]

if not plot_features:
    st.write("No suitable continuous features found in RFE selection for plotting logistic curves.")
else:
    for feature in plot_features:
        plot_logistic_curve(feature, result, X_rfe, y, selected_rfe_features)