# NOTE: the original paste began with "Spaces: / Sleeping / Sleeping" — Hugging
# Face Spaces page-status residue from scraping, not part of the program.
# Third-party imports, grouped: UI, data handling, modelling, plotting.
import streamlit as st

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Full-width layout and the dashboard title.
st.set_page_config(layout="wide")
st.title("Hotel Booking Cancellation Prediction Dashboard")
@st.cache_data
def load_data():
    """Download the hotel-bookings CSV from GitHub and apply the initial cleaning.

    Cached with st.cache_data so the CSV is fetched once per session instead of
    on every Streamlit rerun (the original re-downloaded it on each interaction).

    Returns:
        pandas.DataFrame with sparse / leakage-prone columns dropped and every
        remaining row fully populated, or None if the download fails.
    """
    # Direct "raw" file URL — pandas needs the raw content, not the GitHub HTML page.
    url = "https://raw.githubusercontent.com/pegumzs/Siep/main/hotel_bookings.csv"
    try:
        df = pd.read_csv(url)
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        st.error(f"Não foi possível carregar os dados do GitHub. Erro: {e}")
        return None
    # Columns removed on purpose:
    #   - 'company' / 'agent': mostly null.
    #   - 'reservation_status' / 'reservation_status_date': data leakage — they
    #     contain the very outcome we want to predict.
    #   - remaining date/room columns: left out to keep this model iteration simple.
    cols_to_drop = [
        "company", "agent", "reservation_status_date", "arrival_date_week_number",
        "arrival_date_year", "assigned_room_type", "reserved_room_type",
        "days_in_waiting_list", "reservation_status", "country",
    ]
    df.drop(columns=cols_to_drop, errors="ignore", inplace=True)
    # Deliberate strategy: train only on 100%-complete records.
    df.dropna(inplace=True)
    # 'adr' (average daily rate) may hold non-numeric values: coerce to numbers
    # and drop the rows where the conversion was impossible.
    df['adr'] = pd.to_numeric(df['adr'], errors='coerce')
    df.dropna(subset=['adr'], inplace=True)
    return df
def train_model(df):
    """Fit a statsmodels logistic regression predicting 'is_canceled'.

    Pipeline: one-hot encode categoricals -> RFE down to 15 features ->
    70/30 train/test split -> SMOTE oversampling of the training set only ->
    statsmodels Logit fit (gives coefficients and p-values for the dashboard).

    Returns:
        (fit results, test design matrix, test target, RFE-selected design
        matrix for the full data, full target, RFE-selected feature names)
    """
    # Candidate predictors; 'is_canceled' is the target variable.
    continuous_features = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
                           'booking_changes', 'adr', 'total_of_special_requests']
    categorical_features = ['hotel', 'market_segment', 'deposit_type',
                            'customer_type', 'arrival_date_month']
    selected_features = continuous_features + categorical_features
    # drop_first avoids the dummy-variable trap (perfect multicollinearity).
    df_processed = pd.get_dummies(df[selected_features], columns=categorical_features, drop_first=True)
    df_processed['is_canceled'] = df['is_canceled']
    X = df_processed.drop('is_canceled', axis=1)
    y = df_processed['is_canceled']
    # Recursive Feature Elimination down to 15 predictors.
    # NOTE(review): RFE is fitted on the FULL data set, before the train/test
    # split — a mild form of selection leakage; consider fitting it on the
    # training portion only.
    model_rfe_estimator = LogisticRegression(solver='liblinear', random_state=42, max_iter=1000)
    num_features_to_select = 15
    rfe = RFE(estimator=model_rfe_estimator, n_features_to_select=num_features_to_select)
    rfe.fit(X, y)
    selected_rfe_features = X.columns[rfe.support_]
    X_rfe = X[selected_rfe_features]
    # 70/30 hold-out split.
    X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = train_test_split(X_rfe, y, test_size=0.3, random_state=42)
    # Balance classes on the training set only; the test set stays untouched.
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_rfe, y_train_rfe)
    # statsmodels needs an explicit intercept column.
    X_train_resampled_sm = sm.add_constant(X_train_resampled)
    X_test_rfe_sm = sm.add_constant(X_test_rfe)
    # Coerce every column to numeric (get_dummies can yield bool/object
    # columns), then drop any column that failed coercion on either side.
    for col in X_train_resampled_sm.columns:
        X_train_resampled_sm[col] = pd.to_numeric(X_train_resampled_sm[col], errors='coerce')
    for col in X_test_rfe_sm.columns:
        X_test_rfe_sm[col] = pd.to_numeric(X_test_rfe_sm[col], errors='coerce')
    X_train_resampled_sm.dropna(axis=1, inplace=True)
    X_test_rfe_sm.dropna(axis=1, inplace=True)
    # Keep only columns present in both splits, then force the test matrix
    # into the train matrix's column order (Logit.predict is order-sensitive).
    common_cols = list(set(X_train_resampled_sm.columns) & set(X_test_rfe_sm.columns))
    X_train_resampled_sm = X_train_resampled_sm[common_cols]
    X_test_rfe_sm = X_test_rfe_sm[common_cols]
    X_test_rfe_sm = X_test_rfe_sm[X_train_resampled_sm.columns]  # Reorder columns
    # Maximum-likelihood fit; the results object supplies params and p-values.
    logit_model = sm.Logit(y_train_resampled, X_train_resampled_sm)
    result = logit_model.fit()
    return result, X_test_rfe_sm, y_test_rfe, X_rfe, y, selected_rfe_features
# --- Main application logic ---
df = load_data()
# load_data() returns None when the download fails; stop the script cleanly
# here instead of crashing inside train_model with a TypeError on None.
if df is None:
    st.stop()
result, X_test_rfe_sm, y_test_rfe, X_rfe, y, selected_rfe_features = train_model(df)
# --- Display Model Summary and Coefficients ---
st.header("Model Interpretation: Coefficients, Odds Ratios, and P-values")
# One row per fitted term: raw log-odds, exponentiated odds ratio, p-value.
params = result.params
coef_table = pd.DataFrame({
    'Feature': params.index,
    'Log-Odds': params.values,
    'Odds Ratio': np.exp(params.values),
    'P-value': result.pvalues.values,
})
st.dataframe(coef_table.sort_values(by='Odds Ratio', ascending=False))
# --- Multicollinearity Check (VIF) ---
st.header("Multicollinearity Check (VIF)")
# VIF per column of the test design matrix (intercept column included).
design = X_test_rfe_sm.values
vif_table = pd.DataFrame({
    "feature": X_test_rfe_sm.columns,
    "VIF": [variance_inflation_factor(design, idx) for idx in range(X_test_rfe_sm.shape[1])],
})
st.dataframe(vif_table.sort_values(by="VIF", ascending=False))
# --- Model Evaluation ---
st.header("Model Evaluation")
# Predicted probabilities from the fitted Logit, thresholded at 0.5.
# (These two names are reused below by the ROC section — keep them.)
y_pred_smote_prob = result.predict(X_test_rfe_sm)
y_pred_smote = (y_pred_smote_prob >= 0.5).astype(int)
st.subheader("Accuracy")
acc = accuracy_score(y_test_rfe, y_pred_smote)
st.write(f"Model Accuracy: {acc:.4f}")
st.subheader("Confusion Matrix")
st.write(confusion_matrix(y_test_rfe, y_pred_smote))
st.subheader("Classification Report")
st.text(classification_report(y_test_rfe, y_pred_smote))
# --- ROC Curve and AUC ---
st.header("ROC Curve and AUC")
fpr, tpr, thresholds = roc_curve(y_test_rfe, y_pred_smote_prob)
roc_auc = auc(fpr, tpr)
fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
# Model curve plus the chance diagonal for reference.
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax_roc.set(xlim=[0.0, 1.0], ylim=[0.0, 1.05],
           xlabel='False Positive Rate', ylabel='True Positive Rate',
           title='Receiver Operating Characteristic (ROC) Curve')
ax_roc.legend(loc="lower right")
st.pyplot(fig_roc)
# --- Logistic Curves for Selected Continuous Variables ---
st.header("Logistic Curves for Selected Continuous Variables")
def plot_logistic_curve(feature_name, model_results, X_data, y_data, selected_features):
    """Plot predicted cancellation probability against one continuous feature.

    All other model features are held at their mean while `feature_name`
    sweeps its observed range; the actual 0/1 outcomes are drawn as a faint
    scatter under the fitted curve.  `selected_features` is accepted for
    interface symmetry but not used in the body.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x=X_data[feature_name], y=y_data, alpha=0.1, label='Actual Cancellation (0/1)', ax=ax)
    # 500 evenly spaced points across the feature's observed range.
    x_range = np.linspace(X_data[feature_name].min(), X_data[feature_name].max(), 500)
    # Build a prediction frame: every other feature fixed at its mean and
    # `feature_name` swept across x_range.  The intercept column must be
    # handled explicitly because the model was fitted with sm.add_constant.
    if 'const' in model_results.params.index:
        mean_values = X_data.drop(columns=[feature_name], errors='ignore').mean().to_dict()
        predictions_df = pd.DataFrame([mean_values] * len(x_range), columns=X_data.drop(columns=[feature_name], errors='ignore').columns)
        predictions_df[feature_name] = x_range
        predictions_df = sm.add_constant(predictions_df, prepend=True)  # Add constant first
    else:
        mean_values = X_data.drop(columns=[feature_name], errors='ignore').mean().to_dict()
        predictions_df = pd.DataFrame([mean_values] * len(x_range), columns=X_data.drop(columns=[feature_name], errors='ignore').columns)
        predictions_df[feature_name] = x_range
    # Reorder columns to the model's expected order — `result.predict`
    # matches features positionally, not by name.  This drops 'const'...
    predictions_df = predictions_df[model_results.params.index.drop('const', errors='ignore')]
    # ...so re-attach the intercept column if the fitted model has one.
    if 'const' in model_results.params.index:
        predictions_df = sm.add_constant(predictions_df, prepend=True)
    y_proba = model_results.predict(predictions_df)
    ax.plot(x_range, y_proba, color='red', lw=2, label='Predicted Probability of Cancellation')
    ax.set_title(f'Logistic Curve for {feature_name}')
    ax.set_xlabel(feature_name)
    ax.set_ylabel('Probability of Cancellation')
    ax.legend()
    ax.grid(True)
    st.pyplot(fig)
# Continuous features that survived RFE and are candidates for plotting.
continuous_features_for_plot = [f for f in selected_rfe_features if f in ['lead_time', 'adr', 'booking_changes', 'stays_in_week_nights', 'total_of_special_requests', 'stays_in_weekend_nights']]
# Pick at most three features to plot, in this priority order.
# ('stays_in_weekend_nights' is filtered above but never plotted, matching
# the original selection rules.)
_PLOT_PRIORITY = ['lead_time', 'adr', 'booking_changes',
                  'total_of_special_requests', 'stays_in_week_nights']
plot_features = [f for f in _PLOT_PRIORITY if f in continuous_features_for_plot][:3]
if not plot_features:
    st.write("No suitable continuous features found in RFE selection for plotting logistic curves.")
else:
    for feature in plot_features:
        plot_logistic_curve(feature, result, X_rfe, y, selected_rfe_features)