File size: 13,548 Bytes
bd6fd15
4bf1890
 
 
bd6fd15
2b32d51
 
bd6fd15
 
4bf1890
 
 
bd6fd15
4bf1890
 
bd6fd15
4bf1890
 
 
 
 
 
 
 
 
2b32d51
4bf1890
 
 
2b32d51
4bf1890
 
 
 
2b32d51
bd6fd15
4bf1890
 
bd6fd15
 
 
4bf1890
bd6fd15
4bf1890
 
 
2b32d51
4bf1890
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b32d51
4bf1890
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b32d51
4bf1890
 
2b32d51
4bf1890
 
 
bd6fd15
 
4bf1890
bd6fd15
 
 
 
4bf1890
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b32d51
4bf1890
 
 
 
 
 
 
bd6fd15
 
4bf1890
 
 
 
2b32d51
bd6fd15
4bf1890
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b32d51
 
4bf1890
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
"""
Hotel Analytics Dashboard — Team A8
AI for Big Data Management (SE21) — ESCP Business School 2026
HuggingFace Space: Gradio App with 3 tabs
"""

import gradio as gr
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import os
import subprocess
import io

# ─────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────

def load_csv_safe(name):
    """Load a CSV file into a DataFrame, or return ``None`` if it is unusable.

    Args:
        name: Path of the CSV file (callers pass names relative to the
            current working directory).

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the path is not an
        existing regular file or the file cannot be read or parsed as CSV
        (empty file, malformed rows, undecodable bytes, I/O error).
    """
    if not os.path.isfile(name):
        return None
    try:
        return pd.read_csv(name)
    except (OSError, UnicodeDecodeError,
            pd.errors.EmptyDataError, pd.errors.ParserError):
        # Callers treat None as "no data yet"; a half-written or corrupt
        # pipeline output should degrade the same way instead of crashing.
        return None

# ─────────────────────────────────────────────
# TAB 1 β€” PIPELINE RUNNER
# ─────────────────────────────────────────────

def run_notebook(notebook_name):
    """Execute a Jupyter notebook via ``jupyter nbconvert`` and describe the outcome.

    Args:
        notebook_name: Path of the ``.ipynb`` file to execute.

    Returns:
        A human-readable status message. A successful run always contains
        ``"<notebook_name> executed successfully!"``; a missing file, a
        non-zero exit code, a timeout or a launch error each produce their
        own message. Exceptions from launching the subprocess are reported
        in the returned text rather than raised.
    """
    if not os.path.exists(notebook_name):
        return (f"❌ File not found: {notebook_name}\n\n"
                "Please upload the notebook to the Space files.")

    # Value handed to nbconvert's ExecutePreprocessor.timeout option; the
    # subprocess gets one extra minute on top of it before being killed.
    cell_timeout = 600
    process_timeout = cell_timeout + 60
    command = [
        "jupyter", "nbconvert", "--to", "notebook", "--execute",
        f"--ExecutePreprocessor.timeout={cell_timeout}", notebook_name,
    ]

    try:
        result = subprocess.run(
            command, capture_output=True, text=True, timeout=process_timeout
        )
    except subprocess.TimeoutExpired:
        # Report the limit that was actually enforced on the process.
        return f"⏰ {notebook_name} timed out after {process_timeout // 60} minutes."
    except Exception as e:
        # Deliberately broad: this runs behind a UI button, so any launch
        # failure (e.g. `jupyter` not installed) is shown, not raised.
        return f"❌ Error: {e}"

    if result.returncode == 0:
        tail = result.stdout[-500:] if result.stdout else 'Done.'
        return f"✅ {notebook_name} executed successfully!\n\n{tail}"

    # A non-zero exit code means the run failed, so say so explicitly and
    # fall back to stdout when stderr is empty.
    details = (result.stderr or result.stdout or "")[-1000:] or "No output captured."
    return (f"❌ {notebook_name} failed (exit code {result.returncode}):"
            f"\n\n{details}")

def run_pipeline():
    """Run both pipeline notebooks in sequence and return the combined log.

    Both notebooks are always attempted. The closing line reports success
    only when every step's status text says it executed successfully;
    otherwise it tells the user to check the log.

    Returns:
        The full execution log as a single string.
    """
    divider = "━" * 40
    notebooks = ("datacreation.ipynb", "pythonanalysis.ipynb")

    parts = ["🚀 Starting full pipeline...\n\n"]
    all_ok = True
    for step, notebook in enumerate(notebooks, start=1):
        status = run_notebook(notebook)
        # run_notebook reports success with this exact phrase; matching on
        # the plain-text part avoids depending on its leading emoji.
        if f"{notebook} executed successfully!" not in status:
            all_ok = False
        parts.append(
            f"{divider}\n📓 Step {step}: {notebook}\n{divider}\n{status}\n\n"
        )

    if all_ok:
        parts.append("✅ Pipeline complete! Switch to the Dashboard tab to view results.")
    else:
        parts.append("⚠️ Pipeline finished with errors. Review the log above "
                     "before opening the Dashboard tab.")
    return "".join(parts)

def run_nb1():
    """Execute ``datacreation.ipynb`` on its own and return the status text."""
    notebook = "datacreation.ipynb"
    return run_notebook(notebook)

def run_nb2():
    """Execute ``pythonanalysis.ipynb`` on its own and return the status text."""
    notebook = "pythonanalysis.ipynb"
    return run_notebook(notebook)

# ─────────────────────────────────────────────
# TAB 2 β€” DASHBOARD
# ─────────────────────────────────────────────

# Bar colours for the sentiment charts, keyed by sentiment label.
_SENTIMENT_COLORS = {'positive': '#2ecc71', 'neutral': '#f39c12', 'negative': '#e74c3c'}
# Colour used for any label that is not in _SENTIMENT_COLORS.
_OTHER_SENTIMENT_COLOR = '#95a5a6'


def _has_columns(df, *columns):
    """Return True when *df* was loaded and contains every name in *columns*."""
    return df is not None and all(col in df.columns for col in columns)


def _show_placeholder(ax, message, **text_kwargs):
    """Blank out *ax* and show *message* centred in its place."""
    ax.text(0.5, 0.5, message, ha='center', va='center',
            transform=ax.transAxes, **text_kwargs)
    ax.set_axis_off()


def _plot_sentiment_counts(ax, df_reviews):
    """Draw the number of reviews per sentiment label as a bar chart."""
    counts = df_reviews['sentiment_label'].value_counts()
    bars = ax.bar(
        counts.index, counts.values,
        color=[_SENTIMENT_COLORS.get(label, _OTHER_SENTIMENT_COLOR)
               for label in counts.index],
    )
    ax.set_title("Guest Sentiment Distribution", fontweight='bold')
    ax.set_ylabel("Number of Reviews")
    for bar, val in zip(bars, counts.values):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 20,
                str(val), ha='center', fontsize=10)


def _plot_monthly_revenue(ax, df_monthly):
    """Draw the revenue series, against dates when a 'date' column exists."""
    revenue = df_monthly['total_revenue']
    if 'date' in df_monthly.columns:
        ax.plot(pd.to_datetime(df_monthly['date']), revenue, 'b-o', markersize=4)
        ax.tick_params(axis='x', rotation=45)
    else:
        ax.plot(revenue.values, 'b-o', markersize=4)
    ax.set_title("Monthly Revenue Trend", fontweight='bold')
    ax.set_ylabel("Revenue (€)")
    ax.grid(True, alpha=0.3)


def _plot_adr_by_hotel(ax, df_bookings):
    """Draw the mean 'adr' per 'hotel' value as horizontal bars."""
    adr_by_type = df_bookings.groupby('hotel')['adr'].mean()
    bars = ax.barh(adr_by_type.index, adr_by_type.values,
                   color=['#3498db', '#2980b9'])
    ax.set_title("Average Daily Rate by Hotel Type", fontweight='bold')
    ax.set_xlabel("ADR (€)")
    for bar, val in zip(bars, adr_by_type.values):
        ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2,
                f"€{val:.0f}", va='center', fontsize=11)


def _plot_cancellation_by_sentiment(ax, df_reviews):
    """Draw the mean of 'is_canceled' (as a percentage) per sentiment label."""
    rates = df_reviews.groupby('sentiment_label')['is_canceled'].mean() * 100
    rates = rates.reindex(['positive', 'neutral', 'negative']).dropna()
    # Colour by label, not by position: if one sentiment is absent the
    # remaining bars must keep their own colours.
    bars = ax.bar(
        rates.index, rates.values,
        color=[_SENTIMENT_COLORS.get(label, _OTHER_SENTIMENT_COLOR)
               for label in rates.index],
    )
    ax.set_title("Cancellation Rate by Sentiment", fontweight='bold')
    ax.set_ylabel("Cancellation Rate (%)")
    for bar, val in zip(bars, rates.values):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
                f"{val:.1f}%", ha='center', fontsize=10)


def build_dashboard():
    """Generate the 2x2 analytics figure from the pipeline's CSV outputs.

    Reads ``hotel_bookings_cleaned.csv``, ``synthetic_hotel_reviews.csv`` and
    ``monthly_hotel_revenue.csv`` through ``load_csv_safe``. Any panel whose
    file or required columns are missing shows a short placeholder text; if
    none of the three files could be loaded, a single "No data found"
    figure is returned instead.

    Returns:
        A matplotlib ``Figure``. It is closed in pyplot before being returned
        so that repeated refreshes do not accumulate open figures; the
        returned object can still be rendered.
    """
    df_bookings = load_csv_safe("hotel_bookings_cleaned.csv")
    df_reviews = load_csv_safe("synthetic_hotel_reviews.csv")
    df_monthly = load_csv_safe("monthly_hotel_revenue.csv")

    if df_bookings is None and df_reviews is None and df_monthly is None:
        fig, ax = plt.subplots(1, 1, figsize=(10, 6))
        _show_placeholder(
            ax,
            "No data found.\n\nRun the Pipeline first (Tab 1)\nor upload the CSV files.",
            fontsize=16, color='gray',
        )
        plt.close(fig)
        return fig

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle("Hotel Analytics Dashboard — Team A8",
                 fontsize=16, fontweight='bold', y=0.98)

    # Chart 1: sentiment distribution
    if _has_columns(df_reviews, 'sentiment_label'):
        _plot_sentiment_counts(axes[0, 0], df_reviews)
    else:
        _show_placeholder(axes[0, 0], "No review data")

    # Chart 2: monthly revenue
    if _has_columns(df_monthly, 'total_revenue'):
        _plot_monthly_revenue(axes[0, 1], df_monthly)
    else:
        _show_placeholder(axes[0, 1], "No revenue data")

    # Chart 3: ADR by hotel type
    if _has_columns(df_bookings, 'adr', 'hotel'):
        _plot_adr_by_hotel(axes[1, 0], df_bookings)
    else:
        _show_placeholder(axes[1, 0], "No booking data")

    # Chart 4: cancellation rate by sentiment
    if _has_columns(df_reviews, 'is_canceled', 'sentiment_label'):
        _plot_cancellation_by_sentiment(axes[1, 1], df_reviews)
    else:
        _show_placeholder(axes[1, 1], "No cancellation data")

    # Lay out this figure explicitly instead of pyplot's "current" figure,
    # which may belong to another concurrent request.
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    plt.close(fig)
    return fig

# ─────────────────────────────────────────────
# TAB 3 β€” AI DASHBOARD (keyword Q&A)
# ─────────────────────────────────────────────

def _question_tokens(question):
    """Lower-case *question* and split it into alphanumeric words."""
    cleaned = "".join(ch if ch.isalnum() else " " for ch in question.lower())
    return cleaned.split()


def _mentions(tokens, *stems):
    """Return True if any word in *tokens* starts with one of *stems*."""
    return any(token.startswith(stems) for token in tokens)


def _frame_has(df, *columns):
    """Return True when *df* was loaded and contains every name in *columns*."""
    return df is not None and all(col in df.columns for col in columns)


def ai_answer(question):
    """Answer a free-text question with simple keyword-based statistics.

    The question is split into words and each topic is triggered by a word
    that *starts with* one of its keywords, so "generated" no longer counts
    as "rate". Topics are tried in a fixed order; a topic whose data file or
    columns are unavailable is skipped and the next one is tried.

    Args:
        question: The user's question; may be ``None`` or blank.

    Returns:
        A Markdown-formatted answer, a prompt to enter a question, a notice
        that no data exists yet, or a help text listing supported topics.
    """
    if not question or not question.strip():
        return "Please enter a question about the hotel data."

    tokens = _question_tokens(question)
    df_bookings = load_csv_safe("hotel_bookings_cleaned.csv")
    df_reviews = load_csv_safe("synthetic_hotel_reviews.csv")
    df_monthly = load_csv_safe("monthly_hotel_revenue.csv")

    # The monthly file alone is enough to answer revenue questions, so only
    # give up when all three sources are missing.
    if df_bookings is None and df_reviews is None and df_monthly is None:
        return ("⚠️ No data available yet. Please run the Pipeline first (Tab 1) "
                "to generate the CSV files.")

    # Revenue questions
    if _mentions(tokens, 'revenue', 'money', 'income', 'earn'):
        if (_frame_has(df_monthly, 'total_revenue')
                and df_monthly['total_revenue'].notna().any()):
            revenue = df_monthly['total_revenue']
            peak_idx = revenue.idxmax()
            peak = df_monthly.loc[peak_idx]
            # Fall back to the row index when the file has no 'date' column.
            peak_label = peak['date'] if 'date' in df_monthly.columns else f"row {peak_idx}"
            return (f"📊 **Revenue Analysis**\n\n"
                    f"• Total revenue across all months: **€{revenue.sum():,.0f}**\n"
                    f"• Average monthly revenue: **€{revenue.mean():,.0f}**\n"
                    f"• Peak month: **{peak_label}** with €{peak['total_revenue']:,.0f}")

    # Cancellation questions. Checked before pricing because the phrase
    # "cancellation rate" would otherwise be claimed by the keyword "rate".
    if _mentions(tokens, 'cancel'):
        if _frame_has(df_bookings, 'is_canceled', 'hotel'):
            rate = df_bookings['is_canceled'].mean() * 100
            by_type = df_bookings.groupby('hotel')['is_canceled'].mean() * 100
            return (f"📊 **Cancellation Analysis**\n\n"
                    f"• Overall cancellation rate: **{rate:.1f}%**\n" +
                    "\n".join(f"• {h}: **{v:.1f}%**" for h, v in by_type.items()))

    # ADR / pricing questions
    if _mentions(tokens, 'adr', 'price', 'rate', 'pricing', 'cost'):
        if _frame_has(df_bookings, 'adr', 'hotel'):
            adr_by_type = df_bookings.groupby('hotel')['adr'].mean()
            overall = df_bookings['adr'].mean()
            return (f"📊 **Pricing Analysis**\n\n"
                    f"• Overall average daily rate: **€{overall:.2f}**\n" +
                    "\n".join(f"• {h}: **€{v:.2f}**" for h, v in adr_by_type.items()))

    # Sentiment questions
    if _mentions(tokens, 'sentiment', 'review', 'opinion', 'feeling', 'satisfaction'):
        if _frame_has(df_reviews, 'sentiment_label'):
            dist = df_reviews['sentiment_label'].value_counts()
            total = len(df_reviews)
            return (f"📊 **Sentiment Analysis**\n\n"
                    f"• Total reviews analyzed: **{total:,}**\n" +
                    "\n".join(f"• {s}: **{c}** ({c / total * 100:.1f}%)"
                              for s, c in dist.items()))

    # Booking / occupancy questions
    if _mentions(tokens, 'booking', 'occupancy', 'guest', 'stay', 'night'):
        if _frame_has(df_bookings, 'total_nights', 'country'):
            total = len(df_bookings)
            avg_nights = df_bookings['total_nights'].mean()
            top_country = df_bookings['country'].value_counts().head(5)
            return (f"📊 **Booking Analysis**\n\n"
                    f"• Total bookings: **{total:,}**\n"
                    f"• Average stay length: **{avg_nights:.1f} nights**\n"
                    f"• Top 5 countries:\n" +
                    "\n".join(f"  {c}: {n:,}" for c, n in top_country.items()))

    # Fallback: nothing matched, or the matching topic had no usable data.
    return ("🤔 I can answer questions about:\n"
            "• **Revenue** and pricing trends\n"
            "• **Sentiment** analysis of guest reviews\n"
            "• **Cancellation** rates and patterns\n"
            "• **Booking** statistics and guest demographics\n\n"
            "Try asking something like: *'What is the average hotel price?'*")

# ─────────────────────────────────────────────
# GRADIO APP
# ─────────────────────────────────────────────

# Three-tab UI: pipeline runner, chart dashboard, keyword Q&A.
# All labels use real Unicode characters (the previous text was
# double-encoded and rendered as garbage in the browser).
with gr.Blocks(title="Hotel Analytics — Team A8", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🏨 Hotel Analytics Dashboard — Team A8
    **AI for Big Data Management (SE21) — ESCP Business School 2026**

    *Luxury hotel pricing optimization through sentiment analysis and time-series forecasting.*
    """)

    # Tab 1: run one or both notebooks and show the textual log.
    with gr.Tab("🚀 Pipeline Runner"):
        gr.Markdown("Run the data pipeline to generate analytical outputs.")
        with gr.Row():
            btn_all = gr.Button("▶️ Run Full Pipeline", variant="primary", scale=2)
            btn_nb1 = gr.Button("📓 Run Notebook 1 Only", scale=1)
            btn_nb2 = gr.Button("📓 Run Notebook 2 Only", scale=1)
        output_log = gr.Textbox(label="Execution Log", lines=20, interactive=False)
        btn_all.click(fn=run_pipeline, outputs=output_log)
        btn_nb1.click(fn=run_nb1, outputs=output_log)
        btn_nb2.click(fn=run_nb2, outputs=output_log)

    # Tab 2: render the matplotlib figure built from the CSV outputs.
    with gr.Tab("📊 Dashboard"):
        gr.Markdown("Visual analytics from the processed data. Click **Load** after running the pipeline.")
        btn_dash = gr.Button("🔄 Load / Refresh Dashboard", variant="primary")
        plot_out = gr.Plot()
        btn_dash.click(fn=build_dashboard, outputs=plot_out)

    # Tab 3: keyword-based question answering over the same CSV files.
    with gr.Tab("🤖 AI Dashboard"):
        gr.Markdown("Ask questions about the hotel data in natural language.")
        question = gr.Textbox(label="Your question", placeholder="e.g. What is the cancellation rate?")
        answer = gr.Markdown(label="Answer")
        btn_ask = gr.Button("Ask", variant="primary")
        btn_ask.click(fn=ai_answer, inputs=question, outputs=answer)

        gr.Examples(
            examples=[
                "What is the average hotel price?",
                "Show me the sentiment distribution",
                "What is the cancellation rate?",
                "How much revenue was generated?",
                "Tell me about booking patterns",
            ],
            inputs=question,
        )

# Start the web server when the script is run.
demo.launch()