hotel-analytics-a8

Sleeping

File size: 13,548 Bytes

"""
Hotel Analytics Dashboard — Team A8
AI for Big Data Management (SE21) — ESCP Business School 2026
HuggingFace Space: Gradio App with 3 tabs
"""

import gradio as gr
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import os
import subprocess
import io

# ─────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────

def load_csv_safe(name):
    """Load a CSV file into a DataFrame, or return ``None`` if it is unusable.

    Args:
        name: Path of the CSV file (relative paths resolve against the
            current working directory).

    Returns:
        A ``pandas.DataFrame`` with the file contents, or ``None`` when the
        path is not a regular file or the file cannot be read or parsed as
        CSV (for example an empty file).
    """
    # isfile() rather than exists(): a directory with a matching name is not data.
    if not os.path.isfile(name):
        return None
    try:
        return pd.read_csv(name)
    except (pd.errors.EmptyDataError, pd.errors.ParserError,
            UnicodeDecodeError, OSError):
        # Callers interpret None as "data not available" and show a
        # placeholder, which is friendlier than crashing the UI callback.
        return None

# ─────────────────────────────────────────────
# TAB 1 — PIPELINE RUNNER
# ─────────────────────────────────────────────

def run_notebook(notebook_name, timeout=600):
    """Execute a Jupyter notebook via ``jupyter nbconvert`` and describe the outcome.

    Args:
        notebook_name: Path of the notebook to execute.
        timeout: Value in seconds passed to nbconvert as
            ``--ExecutePreprocessor.timeout``. The subprocess as a whole is
            allowed 60 seconds more than this before it is killed.

    Returns:
        A human-readable status message. It starts with "✅" only when
        nbconvert exits with code 0; a missing file, a non-zero exit code,
        a timeout, or a failure to start the process each produce a
        message with a different leading marker.
    """
    if not os.path.exists(notebook_name):
        return f"❌ File not found: {notebook_name}\n\nPlease upload the notebook to the Space files."

    # Grace period so nbconvert can report its own timeout before we kill it.
    process_timeout = timeout + 60
    command = [
        "jupyter", "nbconvert", "--to", "notebook", "--execute",
        f"--ExecutePreprocessor.timeout={timeout}", notebook_name,
    ]
    try:
        result = subprocess.run(
            command, capture_output=True, text=True, timeout=process_timeout
        )
    except subprocess.TimeoutExpired:
        return f"⏰ {notebook_name} timed out after {process_timeout} seconds."
    except Exception as e:
        # e.g. the `jupyter` executable is not installed / not on PATH.
        return f"❌ Error: {str(e)}"

    if result.returncode == 0:
        return f"✅ {notebook_name} executed successfully!\n\n{result.stdout[-500:] if result.stdout else 'Done.'}"

    # A non-zero exit code is a failure, not a warning. Show the tail of
    # whichever stream has content so the message is never blank.
    details = (result.stderr or result.stdout or "").strip()[-1000:]
    if not details:
        details = "No error output was captured."
    return f"❌ {notebook_name} failed (exit code {result.returncode}):\n\n{details}"

def run_pipeline():
    """Run both notebooks in sequence and return the combined execution log.

    The notebooks run in order: ``datacreation.ipynb`` and then
    ``pythonanalysis.ipynb``. If a step does not succeed, the remaining
    steps are skipped and the log ends with a failure notice instead of
    the success banner.

    Returns:
        The full log text for display in the UI.
    """
    divider = "━" * 40 + "\n"
    notebooks = ("datacreation.ipynb", "pythonanalysis.ipynb")

    parts = ["🚀 Starting full pipeline...\n\n"]
    for number, notebook in enumerate(notebooks, start=1):
        parts.append(divider)
        parts.append(f"📓 Step {number}: {notebook}\n")
        parts.append(divider)
        outcome = run_notebook(notebook)
        parts.append(outcome + "\n\n")
        # run_notebook prefixes its message with "✅" only on success.
        if not outcome.startswith("✅"):
            parts.append(
                f"⛔ Pipeline stopped: {notebook} did not finish successfully, "
                "so the remaining steps were skipped."
            )
            return "".join(parts)

    parts.append("✅ Pipeline complete! Switch to the Dashboard tab to view results.")
    return "".join(parts)

def run_nb1():
    """Run only the first notebook (``datacreation.ipynb``) and return its status message."""
    notebook = "datacreation.ipynb"
    return run_notebook(notebook)

def run_nb2():
    """Run only the second notebook (``pythonanalysis.ipynb``) and return its status message."""
    notebook = "pythonanalysis.ipynb"
    return run_notebook(notebook)

# ─────────────────────────────────────────────
# TAB 2 — DASHBOARD
# ─────────────────────────────────────────────

# Bar colour per sentiment label, shared by the sentiment and cancellation charts.
_SENTIMENT_COLORS = {'positive': '#2ecc71', 'neutral': '#f39c12', 'negative': '#e74c3c'}

def _show_placeholder(ax, message, **text_kwargs):
    """Hide the axes of *ax* and show *message* centred in its place."""
    ax.text(0.5, 0.5, message, ha='center', va='center',
            transform=ax.transAxes, **text_kwargs)
    ax.set_axis_off()

def _plot_sentiment_distribution(ax, df_reviews):
    """Chart 1: bar chart of review counts per ``sentiment_label``."""
    if df_reviews is None or 'sentiment_label' not in df_reviews.columns:
        _show_placeholder(ax, "No review data")
        return
    counts = df_reviews['sentiment_label'].value_counts()
    bars = ax.bar(counts.index, counts.values,
                  color=[_SENTIMENT_COLORS.get(s, '#95a5a6') for s in counts.index])
    ax.set_title("Guest Sentiment Distribution", fontweight='bold')
    ax.set_ylabel("Number of Reviews")
    # Offset labels relative to the tallest bar so they sit just above the
    # bars whether the counts are in the tens or the tens of thousands.
    pad = counts.max() * 0.01 if len(counts) else 0
    for bar, val in zip(bars, counts.values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + pad,
                str(val), ha='center', fontsize=10)

def _plot_monthly_revenue(ax, df_monthly):
    """Chart 2: line chart of ``total_revenue``, against ``date`` when available."""
    if df_monthly is None or 'total_revenue' not in df_monthly.columns:
        _show_placeholder(ax, "No revenue data")
        return
    if 'date' in df_monthly.columns:
        # Sort chronologically so the line does not zig-zag if the CSV rows
        # are out of order; assign() leaves the caller's frame untouched.
        monthly = df_monthly.assign(date=pd.to_datetime(df_monthly['date'])).sort_values('date')
        ax.plot(monthly['date'], monthly['total_revenue'], 'b-o', markersize=4)
        ax.tick_params(axis='x', rotation=45)
    else:
        ax.plot(df_monthly['total_revenue'].values, 'b-o', markersize=4)
    ax.set_title("Monthly Revenue Trend", fontweight='bold')
    ax.set_ylabel("Revenue (€)")
    ax.grid(True, alpha=0.3)

def _plot_adr_by_hotel(ax, df_bookings):
    """Chart 3: horizontal bars of mean ``adr`` per ``hotel`` value."""
    if df_bookings is None or 'adr' not in df_bookings.columns or 'hotel' not in df_bookings.columns:
        _show_placeholder(ax, "No booking data")
        return
    adr_by_type = df_bookings.groupby('hotel')['adr'].mean()
    bars = ax.barh(adr_by_type.index, adr_by_type.values, color=['#3498db', '#2980b9'])
    ax.set_title("Average Daily Rate by Hotel Type", fontweight='bold')
    ax.set_xlabel("ADR (€)")
    for bar, val in zip(bars, adr_by_type.values):
        ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height()/2,
                f"€{val:.0f}", va='center', fontsize=11)

def _plot_cancellation_by_sentiment(ax, df_reviews):
    """Chart 4: mean ``is_canceled`` (as a percentage) per sentiment label."""
    if (df_reviews is None or 'is_canceled' not in df_reviews.columns
            or 'sentiment_label' not in df_reviews.columns):
        _show_placeholder(ax, "No cancellation data")
        return
    rates = df_reviews.groupby('sentiment_label')['is_canceled'].mean() * 100
    # Fixed display order; labels absent from the data are dropped.
    rates = rates.reindex(['positive', 'neutral', 'negative']).dropna()
    # Look colours up by label: slicing a colour list by length would shift
    # the colours whenever one of the labels is missing.
    bars = ax.bar(rates.index, rates.values,
                  color=[_SENTIMENT_COLORS[s] for s in rates.index])
    ax.set_title("Cancellation Rate by Sentiment", fontweight='bold')
    ax.set_ylabel("Cancellation Rate (%)")
    for bar, val in zip(bars, rates.values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                f"{val:.1f}%", ha='center', fontsize=10)

def build_dashboard():
    """Generate the 2x2 dashboard figure from the pipeline's CSV outputs.

    Reads ``hotel_bookings_cleaned.csv``, ``synthetic_hotel_reviews.csv`` and
    ``monthly_hotel_revenue.csv`` via ``load_csv_safe``. A chart whose file or
    required columns are missing is replaced by a placeholder message; if none
    of the files can be loaded, a single "No data found" figure is returned.

    Returns:
        A matplotlib ``Figure``. It is already closed in pyplot's registry
        (so repeated refreshes do not accumulate open figures) but can still
        be rendered or saved by the caller.
    """
    df_bookings = load_csv_safe("hotel_bookings_cleaned.csv")
    df_reviews = load_csv_safe("synthetic_hotel_reviews.csv")
    df_monthly = load_csv_safe("monthly_hotel_revenue.csv")

    if df_bookings is None and df_reviews is None and df_monthly is None:
        fig, ax = plt.subplots(1, 1, figsize=(10, 6))
        try:
            _show_placeholder(
                ax,
                "No data found.\n\nRun the Pipeline first (Tab 1)\nor upload the CSV files.",
                fontsize=16, color='gray')
            return fig
        finally:
            plt.close(fig)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    try:
        fig.suptitle("Hotel Analytics Dashboard — Team A8", fontsize=16, fontweight='bold', y=0.98)
        _plot_sentiment_distribution(axes[0, 0], df_reviews)
        _plot_monthly_revenue(axes[0, 1], df_monthly)
        _plot_adr_by_hotel(axes[1, 0], df_bookings)
        _plot_cancellation_by_sentiment(axes[1, 1], df_reviews)
        # Leave the top 5% of the figure free for the suptitle.
        fig.tight_layout(rect=[0, 0, 1, 0.95])
        return fig
    finally:
        # Runs on success and on error: unregister the figure from pyplot so
        # it can be garbage-collected once the caller is done with it.
        plt.close(fig)

# ─────────────────────────────────────────────
# TAB 3 — AI DASHBOARD (keyword Q&A)
# ─────────────────────────────────────────────

def ai_answer(question):
    """Answer a free-text question with simple keyword matching over the CSV data.

    The lower-cased question is scanned for topic keywords using substring
    matching. Topics are tried in this order: revenue, cancellation, pricing,
    sentiment, bookings. Cancellation is tried before pricing because the
    pricing keyword ``rate`` also occurs in "cancellation rate". A topic is
    answered only if its CSV was loaded, is non-empty and contains the columns
    that answer needs; otherwise matching continues with the next topic.

    Args:
        question: The user's question; may be ``None`` or blank.

    Returns:
        A Markdown-formatted answer, a prompt to enter a question, a notice
        that no data is available, or a help message listing the supported
        topics when nothing matched.
    """
    if not question or not question.strip():
        return "Please enter a question about the hotel data."

    q = question.lower()
    df_bookings = load_csv_safe("hotel_bookings_cleaned.csv")
    df_reviews = load_csv_safe("synthetic_hotel_reviews.csv")
    df_monthly = load_csv_safe("monthly_hotel_revenue.csv")

    # Same "nothing loaded" rule as the dashboard: revenue questions can be
    # answered from the monthly file alone.
    if df_bookings is None and df_reviews is None and df_monthly is None:
        return ("⚠️ No data available yet. Please run the Pipeline first (Tab 1) "
                "to generate the CSV files.")

    def mentions(*keywords):
        """True if any keyword occurs as a substring of the question."""
        return any(w in q for w in keywords)

    def usable(df, *columns):
        """True if *df* was loaded, has rows, and has every required column."""
        return df is not None and not df.empty and all(c in df.columns for c in columns)

    # Revenue questions
    if mentions('revenue', 'money', 'income', 'earn') and usable(df_monthly, 'total_revenue'):
        revenue = df_monthly['total_revenue']
        total = revenue.sum()
        avg = revenue.mean()
        peak = df_monthly.loc[revenue.idxmax()]
        # The dashboard tolerates a missing 'date' column; do the same here.
        peak_label = peak['date'] if 'date' in df_monthly.columns else f"row {peak.name}"
        return (f"📊 **Revenue Analysis**\n\n"
                f"• Total revenue across all months: **€{total:,.0f}**\n"
                f"• Average monthly revenue: **€{avg:,.0f}**\n"
                f"• Peak month: **{peak_label}** with €{peak['total_revenue']:,.0f}")

    # Cancellation questions ('cancel' also matches "cancellation", "cancelled")
    if mentions('cancel') and usable(df_bookings, 'is_canceled', 'hotel'):
        rate = df_bookings['is_canceled'].mean() * 100
        by_type = df_bookings.groupby('hotel')['is_canceled'].mean() * 100
        return (f"📊 **Cancellation Analysis**\n\n"
                f"• Overall cancellation rate: **{rate:.1f}%**\n" +
                "\n".join([f"• {h}: **{v:.1f}%**" for h, v in by_type.items()]))

    # ADR / pricing questions
    if mentions('adr', 'price', 'rate', 'pricing', 'cost') and usable(df_bookings, 'adr', 'hotel'):
        adr_by_type = df_bookings.groupby('hotel')['adr'].mean()
        overall = df_bookings['adr'].mean()
        return (f"📊 **Pricing Analysis**\n\n"
                f"• Overall average daily rate: **€{overall:.2f}**\n" +
                "\n".join([f"• {h}: **€{v:.2f}**" for h, v in adr_by_type.items()]))

    # Sentiment questions
    if (mentions('sentiment', 'review', 'opinion', 'feeling', 'satisfaction')
            and usable(df_reviews, 'sentiment_label')):
        dist = df_reviews['sentiment_label'].value_counts()
        total = len(df_reviews)
        return (f"📊 **Sentiment Analysis**\n\n"
                f"• Total reviews analyzed: **{total:,}**\n" +
                "\n".join([f"• {s}: **{c}** ({c/total*100:.1f}%)" for s, c in dist.items()]))

    # Booking / occupancy questions
    if (mentions('booking', 'occupancy', 'guest', 'stay', 'night')
            and usable(df_bookings, 'total_nights', 'country')):
        total = len(df_bookings)
        avg_nights = df_bookings['total_nights'].mean()
        top_country = df_bookings['country'].value_counts().head(5)
        return (f"📊 **Booking Analysis**\n\n"
                f"• Total bookings: **{total:,}**\n"
                f"• Average stay length: **{avg_nights:.1f} nights**\n"
                f"• Top 5 countries:\n" +
                "\n".join([f"  {c}: {n:,}" for c, n in top_country.items()]))

    # Fallback
    return ("🤔 I can answer questions about:\n"
            "• **Revenue** and pricing trends\n"
            "• **Sentiment** analysis of guest reviews\n"
            "• **Cancellation** rates and patterns\n"
            "• **Booking** statistics and guest demographics\n\n"
            "Try asking something like: *'What is the average hotel price?'*")

# ─────────────────────────────────────────────
# GRADIO APP
# ─────────────────────────────────────────────

# Build the three-tab UI. `demo` stays a module-level name so that tooling
# which imports this module can find the Blocks object.
with gr.Blocks(title="Hotel Analytics — Team A8", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🏨 Hotel Analytics Dashboard — Team A8
    **AI for Big Data Management (SE21) — ESCP Business School 2026**

    *Luxury hotel pricing optimization through sentiment analysis and time-series forecasting.*
    """)

    # Tab 1: run one or both notebooks and show the textual log.
    with gr.Tab("🚀 Pipeline Runner"):
        gr.Markdown("Run the data pipeline to generate analytical outputs.")
        with gr.Row():
            btn_all = gr.Button("▶️ Run Full Pipeline", variant="primary", scale=2)
            btn_nb1 = gr.Button("📓 Run Notebook 1 Only", scale=1)
            btn_nb2 = gr.Button("📓 Run Notebook 2 Only", scale=1)
        output_log = gr.Textbox(label="Execution Log", lines=20, interactive=False)
        btn_all.click(fn=run_pipeline, outputs=output_log)
        btn_nb1.click(fn=run_nb1, outputs=output_log)
        btn_nb2.click(fn=run_nb2, outputs=output_log)

    # Tab 2: render the matplotlib dashboard on demand.
    with gr.Tab("📊 Dashboard"):
        gr.Markdown("Visual analytics from the processed data. Click **Load** after running the pipeline.")
        btn_dash = gr.Button("🔄 Load / Refresh Dashboard", variant="primary")
        plot_out = gr.Plot()
        btn_dash.click(fn=build_dashboard, outputs=plot_out)

    # Tab 3: keyword-based Q&A over the generated CSV files.
    with gr.Tab("🤖 AI Dashboard"):
        gr.Markdown("Ask questions about the hotel data in natural language.")
        question = gr.Textbox(label="Your question", placeholder="e.g. What is the cancellation rate?")
        answer = gr.Markdown(label="Answer")
        btn_ask = gr.Button("Ask", variant="primary")
        btn_ask.click(fn=ai_answer, inputs=question, outputs=answer)

        gr.Examples(
            examples=[
                "What is the average hotel price?",
                "Show me the sentiment distribution",
                "What is the cancellation rate?",
                "How much revenue was generated?",
                "Tell me about booking patterns"
            ],
            inputs=question
        )

# Start the server only when this file is executed as a script, so that
# importing the module (tests, tooling) does not launch a web server.
if __name__ == "__main__":
    demo.launch()