# Matvanc's picture
# Update app.py
# 4bf1890 verified
"""
Hotel Analytics Dashboard — Team A8
AI for Big Data Management (SE21) — ESCP Business School 2026
HuggingFace Space: Gradio App with 3 tabs
"""
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import os
import subprocess
import io
# ─────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────
def load_csv_safe(name):
    """Load a CSV file into a DataFrame, or return ``None`` if it is unusable.

    Args:
        name: Path of the CSV file (relative paths are resolved against the
            current working directory).

    Returns:
        A ``pandas.DataFrame`` with the file's contents, or ``None`` when the
        path is not an existing regular file, the file is empty, or pandas
        cannot parse it.
    """
    # isfile rather than exists: a directory with the same name is treated
    # as "no data" instead of raising inside read_csv.
    if not os.path.isfile(name):
        return None
    try:
        return pd.read_csv(name)
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        # A zero-byte or malformed file is reported like a missing one, so
        # callers only have to handle the ``None`` case.
        return None
# ─────────────────────────────────────────────
# TAB 1 β€” PIPELINE RUNNER
# ─────────────────────────────────────────────
def run_notebook(notebook_name, timeout=600):
    """Execute a Jupyter notebook via ``jupyter nbconvert`` and describe the outcome.

    Args:
        notebook_name: Path of the ``.ipynb`` file to execute.
        timeout: Per-cell execution limit in seconds, passed to
            ``--ExecutePreprocessor.timeout``. The subprocess as a whole is
            allowed 60 seconds more than this before it is abandoned.

    Returns:
        A human-readable status message. Every outcome (missing file,
        success, non-zero exit, timeout, any other exception) is reported as
        text rather than raised, so the result can be shown directly in the
        UI log. The success message contains the phrase
        "executed successfully" on its first line.
    """
    if not os.path.exists(notebook_name):
        return (f"❌ File not found: {notebook_name}\n\n"
                "Please upload the notebook to the Space files.")

    command = [
        "jupyter", "nbconvert", "--to", "notebook", "--execute",
        f"--ExecutePreprocessor.timeout={timeout}", notebook_name,
    ]
    # Grace period so nbconvert can report its own per-cell timeout before
    # the subprocess is abandoned from the outside.
    process_timeout = timeout + 60

    try:
        result = subprocess.run(command, capture_output=True, text=True,
                                timeout=process_timeout)
    except subprocess.TimeoutExpired:
        return f"⏰ {notebook_name} timed out after {process_timeout / 60:g} minutes."
    except Exception as e:
        # UI boundary: e.g. the `jupyter` executable is not installed.
        return f"❌ Error: {e}"

    if result.returncode == 0:
        tail = result.stdout[-500:] if result.stdout else 'Done.'
        return f"✅ {notebook_name} executed successfully!\n\n{tail}"

    # A non-zero exit status is a failure, not a warning. Only the tail of
    # the output is shown to keep the log readable.
    detail = result.stderr[-1000:] or result.stdout[-1000:] or "(no output)"
    return f"❌ {notebook_name} failed (exit code {result.returncode}):\n\n{detail}"
def run_pipeline():
    """Run both notebooks in sequence and return the combined execution log.

    Both notebooks are always attempted, in order. The closing line of the
    log reports success only when every step succeeded; otherwise it points
    the user back to the log.

    Returns:
        The full log as a single string.
    """
    notebooks = ("datacreation.ipynb", "pythonanalysis.ipynb")
    rule = "━" * 40
    parts = ["🚀 Starting full pipeline...\n\n"]
    all_succeeded = True

    for step, notebook in enumerate(notebooks, start=1):
        outcome = run_notebook(notebook)
        # run_notebook reports status as text; its success message carries
        # this phrase on the first line. Checking only the first line keeps
        # captured notebook output from being mistaken for a status.
        first_line = outcome.split("\n", 1)[0]
        if "executed successfully" not in first_line:
            all_succeeded = False
        parts.append(f"{rule}\n📓 Step {step}: {notebook}\n{rule}\n{outcome}\n\n")

    if all_succeeded:
        parts.append("✅ Pipeline complete! Switch to the Dashboard tab to view results.")
    else:
        parts.append("⚠️ Pipeline finished with errors. Review the log above "
                     "before opening the Dashboard tab.")
    return "".join(parts)
def run_nb1():
    """Run only the first notebook and return its status message."""
    notebook = "datacreation.ipynb"
    return run_notebook(notebook)
def run_nb2():
    """Run only the second notebook and return its status message."""
    notebook = "pythonanalysis.ipynb"
    return run_notebook(notebook)
# ─────────────────────────────────────────────
# TAB 2 β€” DASHBOARD
# ─────────────────────────────────────────────
# Bar colours keyed by sentiment label, so a colour always follows its label
# even when one of the labels is absent from the data.
_SENTIMENT_COLORS = {'positive': '#2ecc71', 'neutral': '#f39c12', 'negative': '#e74c3c'}
_UNKNOWN_SENTIMENT_COLOR = '#95a5a6'


def _show_placeholder(ax, message):
    """Blank out an axes and show a centred message instead of a chart."""
    ax.text(0.5, 0.5, message, ha='center', va='center', transform=ax.transAxes)
    ax.set_axis_off()


def _label_bar_tops(ax, bars, labels):
    """Write one label just above each vertical bar.

    The gap is given in points rather than data units, so it looks the same
    whatever the scale of the y axis.
    """
    for bar, label in zip(bars, labels):
        ax.annotate(label,
                    xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                    xytext=(0, 3), textcoords='offset points',
                    ha='center', va='bottom', fontsize=10)


def _plot_sentiment_distribution(ax, df_reviews):
    """Chart 1: number of reviews per sentiment label."""
    if df_reviews is None or 'sentiment_label' not in df_reviews.columns:
        _show_placeholder(ax, "No review data")
        return
    counts = df_reviews['sentiment_label'].value_counts()
    bar_colors = [_SENTIMENT_COLORS.get(label, _UNKNOWN_SENTIMENT_COLOR)
                  for label in counts.index]
    bars = ax.bar(counts.index, counts.values, color=bar_colors)
    ax.set_title("Guest Sentiment Distribution", fontweight='bold')
    ax.set_ylabel("Number of Reviews")
    _label_bar_tops(ax, bars, [str(value) for value in counts.values])


def _plot_monthly_revenue(ax, df_monthly):
    """Chart 2: revenue per month, against dates when a 'date' column exists."""
    if df_monthly is None or 'total_revenue' not in df_monthly.columns:
        _show_placeholder(ax, "No revenue data")
        return
    if 'date' in df_monthly.columns:
        dates = pd.to_datetime(df_monthly['date'])
        ax.plot(dates, df_monthly['total_revenue'], 'b-o', markersize=4)
        ax.tick_params(axis='x', rotation=45)
    else:
        # No dates available: plot against the row position.
        ax.plot(df_monthly['total_revenue'].values, 'b-o', markersize=4)
    ax.set_title("Monthly Revenue Trend", fontweight='bold')
    ax.set_ylabel("Revenue (€)")
    ax.grid(True, alpha=0.3)


def _plot_adr_by_hotel(ax, df_bookings):
    """Chart 3: mean of the 'adr' column for each value of 'hotel'."""
    if (df_bookings is None
            or 'adr' not in df_bookings.columns
            or 'hotel' not in df_bookings.columns):
        _show_placeholder(ax, "No booking data")
        return
    adr_by_type = df_bookings.groupby('hotel')['adr'].mean()
    bars = ax.barh(adr_by_type.index, adr_by_type.values, color=['#3498db', '#2980b9'])
    ax.set_title("Average Daily Rate by Hotel Type", fontweight='bold')
    ax.set_xlabel("ADR (€)")
    for bar, value in zip(bars, adr_by_type.values):
        ax.annotate(f"€{value:.0f}",
                    xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
                    xytext=(4, 0), textcoords='offset points',
                    ha='left', va='center', fontsize=11)


def _plot_cancellation_by_sentiment(ax, df_reviews):
    """Chart 4: mean of 'is_canceled' (as a percentage) per sentiment label."""
    if (df_reviews is None
            or 'is_canceled' not in df_reviews.columns
            or 'sentiment_label' not in df_reviews.columns):
        _show_placeholder(ax, "No cancellation data")
        return
    rates = df_reviews.groupby('sentiment_label')['is_canceled'].mean() * 100
    # Fixed display order; labels missing from the data are dropped.
    rates = rates.reindex(['positive', 'neutral', 'negative']).dropna()
    # Colour by label, not by position: slicing a colour list would shift
    # colours whenever a label is missing.
    bar_colors = [_SENTIMENT_COLORS[label] for label in rates.index]
    bars = ax.bar(rates.index, rates.values, color=bar_colors)
    ax.set_title("Cancellation Rate by Sentiment", fontweight='bold')
    ax.set_ylabel("Cancellation Rate (%)")
    _label_bar_tops(ax, bars, [f"{value:.1f}%" for value in rates.values])


def build_dashboard():
    """Generate the 2x2 analytics figure from the pipeline's CSV outputs.

    Reads ``hotel_bookings_cleaned.csv``, ``synthetic_hotel_reviews.csv`` and
    ``monthly_hotel_revenue.csv`` through ``load_csv_safe``. A panel whose
    input file or required columns are missing shows a short placeholder
    message instead of a chart.

    Returns:
        A matplotlib ``Figure``: the four-panel dashboard, or a single-panel
        "No data found" figure when none of the three files could be loaded.
    """
    df_bookings = load_csv_safe("hotel_bookings_cleaned.csv")
    df_reviews = load_csv_safe("synthetic_hotel_reviews.csv")
    df_monthly = load_csv_safe("monthly_hotel_revenue.csv")

    if df_bookings is None and df_reviews is None and df_monthly is None:
        fig, ax = plt.subplots(1, 1, figsize=(10, 6))
        try:
            ax.text(0.5, 0.5,
                    "No data found.\n\nRun the Pipeline first (Tab 1)\nor upload the CSV files.",
                    ha='center', va='center', fontsize=16, color='gray',
                    transform=ax.transAxes)
            ax.set_axis_off()
            return fig
        finally:
            plt.close(fig)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    try:
        fig.suptitle("Hotel Analytics Dashboard — Team A8",
                     fontsize=16, fontweight='bold', y=0.98)
        _plot_sentiment_distribution(axes[0, 0], df_reviews)
        _plot_monthly_revenue(axes[0, 1], df_monthly)
        _plot_adr_by_hotel(axes[1, 0], df_bookings)
        _plot_cancellation_by_sentiment(axes[1, 1], df_reviews)
        # Called on the figure itself rather than through pyplot's
        # "current figure", which is shared global state.
        fig.tight_layout(rect=[0, 0, 1, 0.95])
        return fig
    finally:
        # pyplot keeps every figure it creates registered until it is closed,
        # so without this each refresh would leave one more figure in memory.
        # Closing only unregisters it from pyplot; the returned Figure object
        # can still be rendered by the caller.
        plt.close(fig)
# ─────────────────────────────────────────────
# TAB 3 β€” AI DASHBOARD (keyword Q&A)
# ─────────────────────────────────────────────
def _has_columns(df, *columns):
    """Return True when *df* is loaded and contains every named column."""
    return df is not None and all(column in df.columns for column in columns)


def ai_answer(question):
    """Answer a free-text question with simple keyword matching over the CSV data.

    The lower-cased question is checked for topic keywords in this order:
    revenue, cancellation, pricing, sentiment, bookings. Cancellation is
    tested before pricing because the pricing keyword 'rate' is a substring
    of "cancellation rate". A topic whose data file or required column is
    missing is skipped, and the next topic is tried.

    Args:
        question: The user's question; may be ``None`` or blank.

    Returns:
        A Markdown-formatted answer, a prompt to enter a question, a notice
        that no data is available, or a help message listing the supported
        topics when nothing matched.
    """
    if not question or not question.strip():
        return "Please enter a question about the hotel data."
    q = question.lower()

    def mentions(*keywords):
        # Substring match, so 'cancel' also covers "cancellation", "canceled".
        return any(keyword in q for keyword in keywords)

    df_bookings = load_csv_safe("hotel_bookings_cleaned.csv")
    df_reviews = load_csv_safe("synthetic_hotel_reviews.csv")
    df_monthly = load_csv_safe("monthly_hotel_revenue.csv")
    # Only report "no data" when all three sources are missing; revenue
    # questions can still be answered from the monthly file alone.
    if df_bookings is None and df_reviews is None and df_monthly is None:
        return ("⚠️ No data available yet. Please run the Pipeline first (Tab 1) "
                "to generate the CSV files.")

    # Revenue questions
    if mentions('revenue', 'money', 'income', 'earn') and _has_columns(df_monthly, 'total_revenue'):
        revenue = df_monthly['total_revenue']
        if revenue.notna().any():  # idxmax needs at least one real value
            peak_index = revenue.idxmax()
            if 'date' in df_monthly.columns:
                peak_label = df_monthly.loc[peak_index, 'date']
            else:
                peak_label = peak_index  # no date column: fall back to the row label
            return (f"📊 **Revenue Analysis**\n\n"
                    f"• Total revenue across all months: **€{revenue.sum():,.0f}**\n"
                    f"• Average monthly revenue: **€{revenue.mean():,.0f}**\n"
                    f"• Peak month: **{peak_label}** with €{revenue.loc[peak_index]:,.0f}")

    # Cancellation questions (must precede pricing, see docstring)
    if mentions('cancel') and _has_columns(df_bookings, 'is_canceled'):
        rate = df_bookings['is_canceled'].mean() * 100
        lines = [f"• Overall cancellation rate: **{rate:.1f}%**"]
        if 'hotel' in df_bookings.columns:
            by_type = df_bookings.groupby('hotel')['is_canceled'].mean() * 100
            lines.extend(f"• {hotel}: **{value:.1f}%**" for hotel, value in by_type.items())
        return "📊 **Cancellation Analysis**\n\n" + "\n".join(lines)

    # Pricing / ADR questions
    if mentions('adr', 'price', 'rate', 'pricing', 'cost') and _has_columns(df_bookings, 'adr'):
        overall = df_bookings['adr'].mean()
        lines = [f"• Overall average daily rate: **€{overall:.2f}**"]
        if 'hotel' in df_bookings.columns:
            adr_by_type = df_bookings.groupby('hotel')['adr'].mean()
            lines.extend(f"• {hotel}: **€{value:.2f}**" for hotel, value in adr_by_type.items())
        return "📊 **Pricing Analysis**\n\n" + "\n".join(lines)

    # Sentiment questions
    if (mentions('sentiment', 'review', 'opinion', 'feeling', 'satisfaction')
            and _has_columns(df_reviews, 'sentiment_label')):
        dist = df_reviews['sentiment_label'].value_counts()
        total = len(df_reviews)
        lines = [f"• Total reviews analyzed: **{total:,}**"]
        lines.extend(f"• {label}: **{count}** ({count / total * 100:.1f}%)"
                     for label, count in dist.items())
        return "📊 **Sentiment Analysis**\n\n" + "\n".join(lines)

    # Booking / occupancy questions
    if mentions('booking', 'occupancy', 'guest', 'stay', 'night') and df_bookings is not None:
        lines = [f"• Total bookings: **{len(df_bookings):,}**"]
        if 'total_nights' in df_bookings.columns:
            avg_nights = df_bookings['total_nights'].mean()
            lines.append(f"• Average stay length: **{avg_nights:.1f} nights**")
        if 'country' in df_bookings.columns:
            top_country = df_bookings['country'].value_counts().head(5)
            lines.append("• Top 5 countries:")
            lines.extend(f" {country}: {count:,}" for country, count in top_country.items())
        return "📊 **Booking Analysis**\n\n" + "\n".join(lines)

    # Fallback
    return ("🤔 I can answer questions about:\n"
            "• **Revenue** and pricing trends\n"
            "• **Sentiment** analysis of guest reviews\n"
            "• **Cancellation** rates and patterns\n"
            "• **Booking** statistics and guest demographics\n\n"
            "Try asking something like: *'What is the average hotel price?'*")
# ─────────────────────────────────────────────
# GRADIO APP
# ─────────────────────────────────────────────
# UI definition: three tabs (pipeline runner, dashboard, keyword Q&A).
# NOTE(review): the original nesting was not recoverable from the source
# text; the execution log is placed below the button row and the examples
# inside the Q&A tab. Confirm against the deployed layout.
with gr.Blocks(title="Hotel Analytics — Team A8", theme=gr.themes.Soft()) as demo:
    # Built from explicit lines so the Markdown does not depend on how the
    # literal is indented inside this `with` block.
    gr.Markdown(
        "\n"
        "# 🏨 Hotel Analytics Dashboard — Team A8\n"
        "**AI for Big Data Management (SE21) — ESCP Business School 2026**\n"
        "*Luxury hotel pricing optimization through sentiment analysis and time-series forecasting.*\n"
    )

    with gr.Tab("🚀 Pipeline Runner"):
        gr.Markdown("Run the data pipeline to generate analytical outputs.")
        with gr.Row():
            btn_all = gr.Button("▶️ Run Full Pipeline", variant="primary", scale=2)
            btn_nb1 = gr.Button("📓 Run Notebook 1 Only", scale=1)
            btn_nb2 = gr.Button("📓 Run Notebook 2 Only", scale=1)
        output_log = gr.Textbox(label="Execution Log", lines=20, interactive=False)
        # All three buttons write to the same log box.
        btn_all.click(fn=run_pipeline, outputs=output_log)
        btn_nb1.click(fn=run_nb1, outputs=output_log)
        btn_nb2.click(fn=run_nb2, outputs=output_log)

    with gr.Tab("📊 Dashboard"):
        gr.Markdown("Visual analytics from the processed data. Click **Load** after running the pipeline.")
        btn_dash = gr.Button("🔄 Load / Refresh Dashboard", variant="primary")
        plot_out = gr.Plot()
        btn_dash.click(fn=build_dashboard, outputs=plot_out)

    with gr.Tab("🤖 AI Dashboard"):
        gr.Markdown("Ask questions about the hotel data in natural language.")
        question = gr.Textbox(label="Your question", placeholder="e.g. What is the cancellation rate?")
        answer = gr.Markdown(label="Answer")
        btn_ask = gr.Button("Ask", variant="primary")
        btn_ask.click(fn=ai_answer, inputs=question, outputs=answer)
        gr.Examples(
            examples=[
                "What is the average hotel price?",
                "Show me the sentiment distribution",
                "What is the cancellation rate?",
                "How much revenue was generated?",
                "Tell me about booking patterns",
            ],
            inputs=question,
        )

demo.launch()