Spaces:

ESCP
/

SE21AppTemplate

Sleeping

App Files Files Community

146

Update app.py

#131

by Moha2266 - opened Apr 27

base: refs/heads/main

←

from: refs/pr/131

Discussion Files changed

+863

-619

Files changed (1) hide show

app.py +863 -619

app.py CHANGED Viewed

@@ -1,17 +1,15 @@
 import os
 import re
 import json
-import time
-import traceback
 from pathlib import Path
-from typing import Dict, Any, List, Tuple
 import pandas as pd
 import gradio as gr
-import papermill as pm
 import plotly.graph_objects as go
-# Optional LLM (HuggingFace Inference API)
 try:
     from huggingface_hub import InferenceClient
 except Exception:
@@ -22,737 +20,983 @@ except Exception:
 # =========================================================
 BASE_DIR = Path(__file__).resolve().parent
-NB1 = os.environ.get("NB1", "datacreation.ipynb").strip()
-NB2 = os.environ.get("NB2", "pythonanalysis.ipynb").strip()
-RUNS_DIR = BASE_DIR / "runs"
-ART_DIR = BASE_DIR / "artifacts"
-PY_FIG_DIR = ART_DIR / "py" / "figures"
-PY_TAB_DIR = ART_DIR / "py" / "tables"
-PAPERMILL_TIMEOUT = int(os.environ.get("PAPERMILL_TIMEOUT", "1800"))
-MAX_PREVIEW_ROWS = int(os.environ.get("MAX_FILE_PREVIEW_ROWS", "50"))
-MAX_LOG_CHARS = int(os.environ.get("MAX_LOG_CHARS", "8000"))
 HF_API_KEY = os.environ.get("HF_API_KEY", "").strip()
-MODEL_NAME = os.environ.get("MODEL_NAME", "deepseek-ai/DeepSeek-R1").strip()
-HF_PROVIDER = os.environ.get("HF_PROVIDER", "novita").strip()
 N8N_WEBHOOK_URL = os.environ.get("N8N_WEBHOOK_URL", "").strip()
 LLM_ENABLED = bool(HF_API_KEY) and InferenceClient is not None
-llm_client = (
-    InferenceClient(provider=HF_PROVIDER, api_key=HF_API_KEY)
-    if LLM_ENABLED
-    else None
-)
 # =========================================================
 # HELPERS
 # =========================================================
-def ensure_dirs():
-    for p in [RUNS_DIR, ART_DIR, PY_FIG_DIR, PY_TAB_DIR]:
-        p.mkdir(parents=True, exist_ok=True)
-def stamp():
-    return time.strftime("%Y%m%d-%H%M%S")
-def tail(text: str, n: int = MAX_LOG_CHARS) -> str:
-    return (text or "")[-n:]
-def _ls(dir_path: Path, exts: Tuple[str, ...]) -> List[str]:
-    if not dir_path.is_dir():
-        return []
-    return sorted(p.name for p in dir_path.iterdir() if p.is_file() and p.suffix.lower() in exts)
-def _read_csv(path: Path) -> pd.DataFrame:
-    return pd.read_csv(path, nrows=MAX_PREVIEW_ROWS)
-def _read_json(path: Path):
-    with path.open(encoding="utf-8") as f:
-        return json.load(f)
-def artifacts_index() -> Dict[str, Any]:
-    return {
-        "python": {
-            "figures": _ls(PY_FIG_DIR, (".png", ".jpg", ".jpeg")),
-            "tables": _ls(PY_TAB_DIR, (".csv", ".json")),
-        },
-    }
 # =========================================================
-# PIPELINE RUNNERS
 # =========================================================
-def run_notebook(nb_name: str) -> str:
-    ensure_dirs()
-    nb_in = BASE_DIR / nb_name
-    if not nb_in.exists():
-        return f"ERROR: {nb_name} not found."
-    nb_out = RUNS_DIR / f"run_{stamp()}_{nb_name}"
-    pm.execute_notebook(
-        input_path=str(nb_in),
-        output_path=str(nb_out),
-        cwd=str(BASE_DIR),
-        log_output=True,
-        progress_bar=False,
-        request_save_on_cell_execute=True,
-        execution_timeout=PAPERMILL_TIMEOUT,
-    )
-    return f"Executed {nb_name}"
-def run_datacreation() -> str:
-    try:
-        log = run_notebook(NB1)
-        csvs = [f.name for f in BASE_DIR.glob("*.csv")]
-        return f"OK  {log}\n\nCSVs now in /app:\n" + "\n".join(f"  - {c}" for c in sorted(csvs))
-    except Exception as e:
-        return f"FAILED  {e}\n\n{traceback.format_exc()[-2000:]}"
-def run_pythonanalysis() -> str:
-    try:
-        log = run_notebook(NB2)
-        idx = artifacts_index()
-        figs = idx["python"]["figures"]
-        tabs = idx["python"]["tables"]
-        return (
-            f"OK  {log}\n\n"
-            f"Figures: {', '.join(figs) or '(none)'}\n"
-            f"Tables:  {', '.join(tabs) or '(none)'}"
         )
-    except Exception as e:
-        return f"FAILED  {e}\n\n{traceback.format_exc()[-2000:]}"
-def run_full_pipeline() -> str:
-    logs = []
-    logs.append("=" * 50)
-    logs.append("STEP 1/2: Data Creation (web scraping + synthetic data)")
-    logs.append("=" * 50)
-    logs.append(run_datacreation())
-    logs.append("")
-    logs.append("=" * 50)
-    logs.append("STEP 2/2: Python Analysis (sentiment, ARIMA, dashboard)")
-    logs.append("=" * 50)
-    logs.append(run_pythonanalysis())
-    return "\n".join(logs)
-# =========================================================
-# GALLERY LOADERS
-# =========================================================
-def _load_all_figures() -> List[Tuple[str, str]]:
-    """Return list of (filepath, caption) for Gallery."""
-    items = []
-    for p in sorted(PY_FIG_DIR.glob("*.png")):
-        items.append((str(p), p.stem.replace('_', ' ').title()))
-    return items
-def _load_table_safe(path: Path) -> pd.DataFrame:
-    try:
-        if path.suffix == ".json":
-            obj = _read_json(path)
-            if isinstance(obj, dict):
-                return pd.DataFrame([obj])
-            return pd.DataFrame(obj)
-        return _read_csv(path)
-    except Exception as e:
-        return pd.DataFrame([{"error": str(e)}])
-def refresh_gallery():
-    """Called when user clicks Refresh on Gallery tab."""
-    figures = _load_all_figures()
-    idx = artifacts_index()
-    table_choices = list(idx["python"]["tables"])
-    default_df = pd.DataFrame()
-    if table_choices:
-        default_df = _load_table_safe(PY_TAB_DIR / table_choices[0])
-    return (
-        figures if figures else [],
-        gr.update(choices=table_choices, value=table_choices[0] if table_choices else None),
-        default_df,
     )
-def on_table_select(choice: str):
-    if not choice:
-        return pd.DataFrame([{"hint": "Select a table above."}])
-    path = PY_TAB_DIR / choice
-    if not path.exists():
-        return pd.DataFrame([{"error": f"File not found: {choice}"}])
-    return _load_table_safe(path)
 # =========================================================
-# KPI LOADER
 # =========================================================
-def load_kpis() -> Dict[str, Any]:
-    for candidate in [PY_TAB_DIR / "kpis.json", PY_FIG_DIR / "kpis.json"]:
-        if candidate.exists():
-            try:
-                return _read_json(candidate)
-            except Exception:
-                pass
-    return {}
-# =========================================================
-# AI DASHBOARD -- LLM picks what to display
-# =========================================================
-DASHBOARD_SYSTEM = """You are an AI dashboard assistant for a book-sales analytics app.
-The user asks questions or requests about their data. You have access to pre-computed
-artifacts from a Python analysis pipeline.
-AVAILABLE ARTIFACTS (only reference ones that exist):
-{artifacts_json}
-KPI SUMMARY: {kpis_json}
-YOUR JOB:
-1. Answer the user's question conversationally using the KPIs and your knowledge of the artifacts.
-2. At the END of your response, output a JSON block (fenced with ```json ... ```) that tells
-   the dashboard which artifact to display. The JSON must have this shape:
-   {{"show": "figure"|"table"|"none", "scope": "python", "filename": "..."}}
-   - Use "show": "figure" to display a chart image.
-   - Use "show": "table" to display a CSV/JSON table.
-   - Use "show": "none" if no artifact is relevant.
-RULES:
-- If the user asks about sales trends or forecasting by title, show sales_trends or arima figures.
-- If the user asks about sentiment, show sentiment figure or sentiment_counts table.
-- If the user asks about forecast accuracy or ARIMA, show arima figures.
-- If the user asks about top sellers, show top_titles_by_units_sold.csv.
-- If the user asks a general data question, pick the most relevant artifact.
-- Keep your answer concise (2-4 sentences), then the JSON block.
-"""
-JSON_BLOCK_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)
-FALLBACK_JSON_RE = re.compile(r"\{[^{}]*\"show\"[^{}]*\}", re.DOTALL)
-def _parse_display_directive(text: str) -> Dict[str, str]:
-    m = JSON_BLOCK_RE.search(text)
-    if m:
-        try:
-            return json.loads(m.group(1))
-        except json.JSONDecodeError:
-            pass
-    m = FALLBACK_JSON_RE.search(text)
-    if m:
-        try:
-            return json.loads(m.group(0))
-        except json.JSONDecodeError:
-            pass
-    return {"show": "none"}
-def _clean_response(text: str) -> str:
-    """Strip the JSON directive block from the displayed response."""
-    return JSON_BLOCK_RE.sub("", text).strip()
-def _n8n_call(msg: str) -> Tuple[str, Dict]:
-    """Call the student's n8n webhook and return (reply, directive)."""
-    import requests as req
-    try:
-        resp = req.post(N8N_WEBHOOK_URL, json={"question": msg}, timeout=20)
-        data = resp.json()
-        answer = data.get("answer", "No response from n8n workflow.")
-        chart = data.get("chart", "none")
-        if chart and chart != "none":
-            return answer, {"show": "figure", "chart": chart}
-        return answer, {"show": "none"}
-    except Exception as e:
-        return f"n8n error: {e}. Falling back to keyword matching.", None
-def ai_chat(user_msg: str, history: list):
-    """Chat function for the AI Dashboard tab."""
-    if not user_msg or not user_msg.strip():
-        return history, "", None, None
-    idx = artifacts_index()
-    kpis = load_kpis()
-    # Priority: n8n webhook > HF LLM > keyword fallback
-    if N8N_WEBHOOK_URL:
-        reply, directive = _n8n_call(user_msg)
-        if directive is None:
-            reply_fb, directive = _keyword_fallback(user_msg, idx, kpis)
-            reply += "\n\n" + reply_fb
-    elif not LLM_ENABLED:
-        reply, directive = _keyword_fallback(user_msg, idx, kpis)
-    else:
-        system = DASHBOARD_SYSTEM.format(
-            artifacts_json=json.dumps(idx, indent=2),
-            kpis_json=json.dumps(kpis, indent=2) if kpis else "(no KPIs yet, run the pipeline first)",
-        )
-        msgs = [{"role": "system", "content": system}]
-        for entry in (history or [])[-6:]:
-            msgs.append(entry)
-        msgs.append({"role": "user", "content": user_msg})
-        try:
-            r = llm_client.chat_completion(
-                model=MODEL_NAME,
-                messages=msgs,
-                temperature=0.3,
-                max_tokens=600,
-                stream=False,
-            )
-            raw = (
-                r["choices"][0]["message"]["content"]
-                if isinstance(r, dict)
-                else r.choices[0].message.content
-            )
-            directive = _parse_display_directive(raw)
-            reply = _clean_response(raw)
-        except Exception as e:
-            reply = f"LLM error: {e}. Falling back to keyword matching."
-            reply_fb, directive = _keyword_fallback(user_msg, idx, kpis)
-            reply += "\n\n" + reply_fb
-    # Resolve artifacts — build interactive Plotly charts when possible
-    chart_out = None
-    tab_out = None
-    show = directive.get("show", "none")
-    fname = directive.get("filename", "")
-    chart_name = directive.get("chart", "")
-    # Interactive chart builders keyed by name
-    chart_builders = {
-        "sales": build_sales_chart,
-        "sentiment": build_sentiment_chart,
-        "top_sellers": build_top_sellers_chart,
-    }
-    if chart_name and chart_name in chart_builders:
-        chart_out = chart_builders[chart_name]()
-    elif show == "figure" and fname:
-        # Fallback: try to match filename to a chart builder
-        if "sales_trend" in fname:
-            chart_out = build_sales_chart()
-        elif "sentiment" in fname:
-            chart_out = build_sentiment_chart()
-        elif "arima" in fname or "forecast" in fname:
-            chart_out = build_sales_chart()  # closest interactive equivalent
-        else:
-            chart_out = _empty_chart(f"No interactive chart for {fname}")
-    if show == "table" and fname:
-        fp = PY_TAB_DIR / fname
-        if fp.exists():
-            tab_out = _load_table_safe(fp)
-        else:
-            reply += f"\n\n*(Could not find table: {fname})*"
-    new_history = (history or []) + [
-        {"role": "user", "content": user_msg},
-        {"role": "assistant", "content": reply},
-    ]
-    return new_history, "", chart_out, tab_out
-def _keyword_fallback(msg: str, idx: Dict, kpis: Dict) -> Tuple[str, Dict]:
-    """Simple keyword matcher when LLM is unavailable."""
-    msg_lower = msg.lower()
-    if not idx["python"]["figures"] and not idx["python"]["tables"]:
-        return (
-            "No artifacts found yet. Please run the pipeline first (Tab 1), "
-            "then come back here to explore the results.",
-            {"show": "none"},
-        )
-    kpi_text = ""
-    if kpis:
-        total = kpis.get("total_units_sold", 0)
-        kpi_text = (
-            f"Quick summary: **{kpis.get('n_titles', '?')}** book titles across "
-            f"**{kpis.get('n_months', '?')}** months, with **{total:,.0f}** total units sold."
-        )
-    if any(w in msg_lower for w in ["trend", "sales trend", "monthly sale"]):
-        return (
-            f"Here are the sales trends. {kpi_text}",
-            {"show": "figure", "chart": "sales"},
-        )
-    if any(w in msg_lower for w in ["sentiment", "review", "positive", "negative"]):
-        return (
-            f"Here is the sentiment distribution across sampled book titles. {kpi_text}",
-            {"show": "figure", "chart": "sentiment"},
-        )
-    if any(w in msg_lower for w in ["arima", "forecast", "predict"]):
-        return (
-            f"Here are the sales trends and forecasts. {kpi_text}",
-            {"show": "figure", "chart": "sales"},
-        )
-    if any(w in msg_lower for w in ["top", "best sell", "popular", "rank"]):
-        return (
-            f"Here are the top-selling titles by units sold. {kpi_text}",
-            {"show": "table", "scope": "python", "filename": "top_titles_by_units_sold.csv"},
-        )
-    if any(w in msg_lower for w in ["price", "pricing", "decision"]):
-        return (
-            f"Here are the pricing decisions. {kpi_text}",
-            {"show": "table", "scope": "python", "filename": "pricing_decisions.csv"},
-        )
-    if any(w in msg_lower for w in ["dashboard", "overview", "summary", "kpi"]):
-        return (
-            f"Dashboard overview: {kpi_text}\n\nAsk me about sales trends, sentiment, forecasts, "
-            "pricing, or top sellers to see specific visualizations.",
-            {"show": "table", "scope": "python", "filename": "df_dashboard.csv"},
-        )
-    # Default
     return (
-        f"I can show you various analyses. {kpi_text}\n\n"
-        "Try asking about: **sales trends**, **sentiment**, **ARIMA forecasts**, "
-        "**pricing decisions**, **top sellers**, or **dashboard overview**.",
-        {"show": "none"},
     )
 # =========================================================
-# KPI CARDS (BubbleBusters style)
 # =========================================================
-def render_kpi_cards() -> str:
-    kpis = load_kpis()
-    if not kpis:
         return (
-            '<div style="background:rgba(255,255,255,.65);backdrop-filter:blur(16px);'
-            'border-radius:20px;padding:28px;text-align:center;'
-            'border:1.5px solid rgba(255,255,255,.7);'
-            'box-shadow:0 8px 32px rgba(124,92,191,.08);">'
-            '<div style="font-size:36px;margin-bottom:10px;">📊</div>'
-            '<div style="color:#a48de8;font-size:14px;'
-            'font-weight:800;margin-bottom:6px;">No data yet</div>'
-            '<div style="color:#9d8fc4;font-size:12px;">'
-            'Run the pipeline to populate these cards.</div>'
-            '</div>'
         )
-    def card(icon, label, value, colour):
-        return f"""
-        <div style="background:rgba(255,255,255,.72);backdrop-filter:blur(16px);
-                    border-radius:20px;padding:18px 14px 16px;text-align:center;
-                    border:1.5px solid rgba(255,255,255,.8);
-                    box-shadow:0 4px 16px rgba(124,92,191,.08);
-                    border-top:3px solid {colour};">
-            <div style="font-size:26px;margin-bottom:7px;line-height:1;">{icon}</div>
-            <div style="color:#9d8fc4;font-size:9.5px;text-transform:uppercase;
-                        letter-spacing:1.8px;margin-bottom:7px;font-weight:800;">{label}</div>
-            <div style="color:#2d1f4e;font-size:16px;font-weight:800;">{value}</div>
-        </div>"""
-    kpi_config = [
-        ("n_titles",         "📚", "Book Titles",  "#a48de8"),
-        ("n_months",         "📅", "Time Periods", "#7aa6f8"),
-        ("total_units_sold", "📦", "Units Sold",   "#6ee7c7"),
-        ("total_revenue",    "💰", "Revenue",      "#3dcba8"),
-    ]
-    html = (
-        '<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));'
-        'gap:12px;margin-bottom:24px;">'
     )
-    for key, icon, label, colour in kpi_config:
-        val = kpis.get(key)
-        if val is None:
-            continue
-        if isinstance(val, (int, float)) and val > 100:
-            val = f"{val:,.0f}"
-        html += card(icon, label, str(val), colour)
-    # Extra KPIs not in config
-    known = {k for k, *_ in kpi_config}
-    for key, val in kpis.items():
-        if key not in known:
-            label = key.replace("_", " ").title()
-            if isinstance(val, (int, float)) and val > 100:
-                val = f"{val:,.0f}"
-            html += card("📈", label, str(val), "#8fa8f8")
-    html += "</div>"
-    return html
-# =========================================================
-# INTERACTIVE PLOTLY CHARTS (BubbleBusters style)
-# =========================================================
-CHART_PALETTE = ["#7c5cbf", "#2ec4a0", "#e8537a", "#e8a230", "#5e8fef",
-                 "#c45ea8", "#3dbacc", "#a0522d", "#6aaa3a", "#d46060"]
-def _styled_layout(**kwargs) -> dict:
-    defaults = dict(
-        template="plotly_white",
-        paper_bgcolor="rgba(255,255,255,0.95)",
-        plot_bgcolor="rgba(255,255,255,0.98)",
-        font=dict(family="system-ui, sans-serif", color="#2d1f4e", size=12),
-        margin=dict(l=60, r=20, t=70, b=70),
-        legend=dict(
-            orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1,
-            bgcolor="rgba(255,255,255,0.92)",
-            bordercolor="rgba(124,92,191,0.35)", borderwidth=1,
-        ),
-        title=dict(font=dict(size=15, color="#4b2d8a")),
-    )
-    defaults.update(kwargs)
-    return defaults
-def _empty_chart(title: str) -> go.Figure:
-    fig = go.Figure()
-    fig.update_layout(
-        title=title, height=420, template="plotly_white",
-        paper_bgcolor="rgba(255,255,255,0.95)",
-        annotations=[dict(text="Run the pipeline to generate data",
-            x=0.5, y=0.5, xref="paper", yref="paper", showarrow=False,
-            font=dict(size=14, color="rgba(124,92,191,0.5)"))],
-    )
-    return fig
-def build_sales_chart() -> go.Figure:
-    path = PY_TAB_DIR / "df_dashboard.csv"
-    if not path.exists():
-        return _empty_chart("Sales Trends — run the pipeline first")
-    df = pd.read_csv(path)
-    date_col = next((c for c in df.columns if "month" in c.lower() or "date" in c.lower()), None)
-    val_cols = [c for c in df.columns if c != date_col and df[c].dtype in ("float64", "int64")]
-    if not date_col or not val_cols:
-        return _empty_chart("Could not auto-detect columns in df_dashboard.csv")
-    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
-    fig = go.Figure()
-    for i, col in enumerate(val_cols):
-        fig.add_trace(go.Scatter(
-            x=df[date_col], y=df[col], name=col.replace("_", " ").title(),
-            mode="lines+markers", line=dict(color=CHART_PALETTE[i % len(CHART_PALETTE)], width=2),
-            marker=dict(size=4),
-            hovertemplate=f"<b>{col.replace('_',' ').title()}</b><br>%{{x|%b %Y}}: %{{y:,.0f}}<extra></extra>",
-        ))
-    fig.update_layout(**_styled_layout(height=450, hovermode="x unified",
-                                        title=dict(text="Monthly Overview")))
-    fig.update_xaxes(gridcolor="rgba(124,92,191,0.15)", showgrid=True)
-    fig.update_yaxes(gridcolor="rgba(124,92,191,0.15)", showgrid=True)
-    return fig
-def build_sentiment_chart() -> go.Figure:
-    path = PY_TAB_DIR / "sentiment_counts_sampled.csv"
-    if not path.exists():
-        return _empty_chart("Sentiment Distribution — run the pipeline first")
-    df = pd.read_csv(path)
-    title_col = df.columns[0]
-    sent_cols = [c for c in ["negative", "neutral", "positive"] if c in df.columns]
-    if not sent_cols:
-        return _empty_chart("No sentiment columns found in CSV")
-    colors = {"negative": "#e8537a", "neutral": "#5e8fef", "positive": "#2ec4a0"}
-    fig = go.Figure()
-    for col in sent_cols:
-        fig.add_trace(go.Bar(
-            name=col.title(), y=df[title_col], x=df[col],
-            orientation="h", marker_color=colors.get(col, "#888"),
-            hovertemplate=f"<b>{col.title()}</b>: %{{x}}<extra></extra>",
-        ))
-    fig.update_layout(**_styled_layout(
-        height=max(400, len(df) * 28), barmode="stack",
-        title=dict(text="Sentiment Distribution by Book"),
-    ))
-    fig.update_xaxes(title="Number of Reviews")
-    fig.update_yaxes(autorange="reversed")
-    return fig
-def build_top_sellers_chart() -> go.Figure:
-    path = PY_TAB_DIR / "top_titles_by_units_sold.csv"
-    if not path.exists():
-        return _empty_chart("Top Sellers — run the pipeline first")
-    df = pd.read_csv(path).head(15)
-    title_col = next((c for c in df.columns if "title" in c.lower()), df.columns[0])
-    val_col = next((c for c in df.columns if "unit" in c.lower() or "sold" in c.lower()), df.columns[-1])
-    fig = go.Figure(go.Bar(
-        y=df[title_col], x=df[val_col], orientation="h",
-        marker=dict(color=df[val_col], colorscale=[[0, "#c5b4f0"], [1, "#7c5cbf"]]),
-        hovertemplate="<b>%{y}</b><br>Units: %{x:,.0f}<extra></extra>",
-    ))
-    fig.update_layout(**_styled_layout(
-        height=max(400, len(df) * 30),
-        title=dict(text="Top Selling Titles"), showlegend=False,
-    ))
-    fig.update_yaxes(autorange="reversed")
-    fig.update_xaxes(title="Total Units Sold")
-    return fig
-def refresh_dashboard():
-    return render_kpi_cards(), build_sales_chart(), build_sentiment_chart(), build_top_sellers_chart()
 # =========================================================
 # UI
 # =========================================================
-ensure_dirs()
-def load_css() -> str:
-    css_path = BASE_DIR / "style.css"
-    return css_path.read_text(encoding="utf-8") if css_path.exists() else ""
-with gr.Blocks(title="AIBDM 2026 Workshop App") as demo:
     gr.Markdown(
-        "# SE21 App Template\n"
-        "*This is an app template for SE21 students*",
         elem_id="escp_title",
     )
-    # ===========================================================
-    # TAB 1 -- Pipeline Runner
-    # ===========================================================
     with gr.Tab("Pipeline Runner"):
-        gr.Markdown()
-        with gr.Row():
-            with gr.Column(scale=1):
-                btn_nb1 = gr.Button("Step 1: Data Creation", variant="secondary")
-            with gr.Column(scale=1):
-                btn_nb2 = gr.Button("Step 2: Python Analysis", variant="secondary")
         with gr.Row():
-            btn_all = gr.Button("Run Full Pipeline (Both Steps)", variant="primary")
-        run_log = gr.Textbox(
-            label="Execution Log",
-            lines=18,
-            max_lines=30,
-            interactive=False,
-        )
-        btn_nb1.click(run_datacreation, outputs=[run_log])
-        btn_nb2.click(run_pythonanalysis, outputs=[run_log])
-        btn_all.click(run_full_pipeline, outputs=[run_log])
-    # ===========================================================
-    # TAB 2 -- Dashboard (KPIs + Interactive Charts + Gallery)
-    # ===========================================================
     with gr.Tab("Dashboard"):
-        kpi_html = gr.HTML(value=render_kpi_cards)
-        refresh_btn = gr.Button("Refresh Dashboard", variant="primary")
-        gr.Markdown("#### Interactive Charts")
-        chart_sales = gr.Plot(label="Monthly Overview")
-        chart_sentiment = gr.Plot(label="Sentiment Distribution")
-        chart_top = gr.Plot(label="Top Sellers")
-        gr.Markdown("#### Static Figures (from notebooks)")
-        gallery = gr.Gallery(
-            label="Generated Figures",
-            columns=2,
-            height=480,
-            object_fit="contain",
-        )
-        gr.Markdown("#### Data Tables")
-        table_dropdown = gr.Dropdown(
-            label="Select a table to view",
-            choices=[],
-            interactive=True,
-        )
-        table_display = gr.Dataframe(
-            label="Table Preview",
-            interactive=False,
-        )
-        def _on_refresh():
-            kpi, c1, c2, c3 = refresh_dashboard()
-            figs, dd, df = refresh_gallery()
-            return kpi, c1, c2, c3, figs, dd, df
-        refresh_btn.click(
-            _on_refresh,
-            outputs=[kpi_html, chart_sales, chart_sentiment, chart_top,
-                     gallery, table_dropdown, table_display],
-        )
-        table_dropdown.change(
-            on_table_select,
-            inputs=[table_dropdown],
-            outputs=[table_display],
-        )
-    # ===========================================================
-    # TAB 3 -- AI Dashboard
-    # ===========================================================
     with gr.Tab('"AI" Dashboard'):
-        _ai_status = (
-            "Connected to your **n8n workflow**." if N8N_WEBHOOK_URL
-            else "**LLM active.**" if LLM_ENABLED
-            else "Using **keyword matching**. Upgrade options: "
-                 "set `N8N_WEBHOOK_URL` to connect your n8n workflow, "
-                 "or set `HF_API_KEY` for direct LLM access."
         )
         gr.Markdown(
-            "### Ask questions, get interactive visualisations\n\n"
-            f"Type a question and the system will pick the right interactive chart or table. {_ai_status}"
         )
-        with gr.Row(equal_height=True):
-            with gr.Column(scale=1):
-                chatbot = gr.Chatbot(
-                    label="Conversation",
-                    height=380,
-                )
-                user_input = gr.Textbox(
-                    label="Ask about your data",
-                    placeholder="e.g. Show me sales trends / What are the top sellers? / Sentiment analysis",
-                    lines=1,
-                )
-                gr.Examples(
-                    examples=[
-                        "Show me the sales trends",
-                        "What does the sentiment look like?",
-                        "Which titles sell the most?",
-                        "Show the ARIMA forecasts",
-                        "What are the pricing decisions?",
-                        "Give me a dashboard overview",
-                    ],
-                    inputs=user_input,
-                )
-            with gr.Column(scale=1):
-                ai_figure = gr.Plot(
-                    label="Interactive Chart",
-                )
-                ai_table = gr.Dataframe(
-                    label="Data Table",
-                    interactive=False,
-                )
-        user_input.submit(
-            ai_chat,
-            inputs=[user_input, chatbot],
-            outputs=[chatbot, user_input, ai_figure, ai_table],
         )
-demo.launch(css=load_css(), allowed_paths=[str(BASE_DIR)])

 import os
 import re
 import json
 from pathlib import Path
+from collections import Counter
 import pandas as pd
 import gradio as gr
+import plotly.express as px
 import plotly.graph_objects as go
+# Optional LLM support
 try:
     from huggingface_hub import InferenceClient
 except Exception:
 # =========================================================
 BASE_DIR = Path(__file__).resolve().parent
 HF_API_KEY = os.environ.get("HF_API_KEY", "").strip()  # empty string when the variable is unset
+MODEL_NAME = os.environ.get("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct").strip()  # model id, overridable via the MODEL_NAME env var
 N8N_WEBHOOK_URL = os.environ.get("N8N_WEBHOOK_URL", "").strip()  # optional; empty string when unset
 LLM_ENABLED = bool(HF_API_KEY) and InferenceClient is not None  # needs an API key and an importable client (InferenceClient is presumably None when the import failed - fallback not visible here)
+llm_client = InferenceClient(api_key=HF_API_KEY) if LLM_ENABLED else None  # None whenever the LLM path is disabled
+MAX_PREVIEW_ROWS = 15  # NOTE(review): presumably caps rows shown in table previews - usage not visible in this excerpt
+IGNORE_SHEETS = {"data_dictionary", "sources", "source", "readme", "metadata"}  # lowercased sheet names skipped by pick_primary_sheet
+POSITIVE_WORDS = {  # lexicon counted as positive hits by simple_sentiment_score
+    "great", "excellent", "amazing", "wonderful", "perfect", "pleasant", "friendly",
+    "clean", "comfortable", "beautiful", "fantastic", "helpful", "enjoyed", "loved",
+    "luxurious", "smooth", "spacious", "professional", "quiet", "impressive"
+}
+NEGATIVE_WORDS = {  # lexicon counted as negative hits by simple_sentiment_score
+    "bad", "poor", "terrible", "awful", "dirty", "slow", "rude", "noisy", "expensive",
+    "disappointing", "uncomfortable", "broken", "worst", "late", "smell", "smelly",
+    "small", "crowded", "issue", "problem", "delay", "unhelpful", "overpriced"
+}
+THEME_KEYWORDS = {  # theme name -> keywords that detect_themes looks for in lowercased text
+    "cleanliness": ["clean", "dirty", "smell", "smelly", "hygiene", "stain", "dust"],
+    "staff": ["staff", "service", "reception", "manager", "employee", "friendly", "rude", "helpful"],
+    "check_in": ["check-in", "check in", "queue", "waiting", "late", "front desk"],
+    "room_comfort": ["bed", "pillow", "comfortable", "room", "sleep", "spacious", "small"],
+    "noise": ["noise", "noisy", "loud", "street", "neighbors"],
+    "breakfast_food": ["breakfast", "food", "restaurant", "buffet", "dinner"],
+    "location": ["location", "near", "far", "transport", "view", "airport"],
+    "value_price": ["price", "expensive", "cheap", "value", "worth", "overpriced"]
+}
 # =========================================================
 # HELPERS
 # =========================================================
def load_css() -> str:
    """Return the text of ``style.css`` located in ``BASE_DIR``.

    Returns:
        The stylesheet contents decoded as UTF-8, or an empty string when
        the file does not exist.
    """
    stylesheet = BASE_DIR / "style.css"
    if not stylesheet.exists():
        return ""
    return stylesheet.read_text(encoding="utf-8")
def normalize_columns(columns):
    """Convert column labels to lowercase snake_case identifiers.

    Each label is stringified, trimmed, lowercased, stripped of punctuation
    and has its whitespace runs replaced by a single underscore.

    Args:
        columns: Iterable of column labels (any type; ``str()`` is applied).

    Returns:
        A list of cleaned names, in the same order as ``columns``.
    """
    cleaned = []
    for col in columns:
        name = str(col).strip().lower()
        # Dropping punctuation can expose new leading/trailing whitespace
        # ("rate (%)" -> "rate "), so trim again before inserting
        # underscores; otherwise the result ends in a stray "_" and no
        # longer matches the expected column names.
        name = re.sub(r"[^\w\s]", "", name).strip()
        cleaned.append(re.sub(r"\s+", "_", name))
    return cleaned
def format_num(x):
    """Format a value for display.

    Args:
        x: Any value. Real numbers (including NumPy integer and float
            scalars) are formatted numerically; everything else is
            stringified.

    Returns:
        ``"N/A"`` for ``None`` and missing scalars (NaN, ``pd.NA``,
        ``pd.NaT``); a thousands-separated integer string when
        ``abs(x) >= 1000``; a two-decimal string for smaller numbers;
        ``str(x)`` for anything else.
    """
    import numbers  # local import: only this helper needs the numeric ABCs

    if x is None:
        return "N/A"
    # numbers.Real also covers NumPy integer scalars (e.g. np.int64), which
    # are not instances of the built-in int and previously fell through to
    # str(x) without any number formatting.
    if isinstance(x, numbers.Real):
        if pd.isna(x):
            return "N/A"
        if abs(x) >= 1000:
            return f"{x:,.0f}"
        return f"{x:.2f}"
    # pd.isna returns an array for list-likes, which cannot be used in a
    # boolean test, so only probe scalars for missingness.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return "N/A"
    return str(x)
def format_pct(x):
    """Format ``x`` as a percentage string with one decimal place.

    The value is multiplied by 100 before formatting. ``None`` and missing
    values yield ``"N/A"``.
    """
    is_missing = x is None or pd.isna(x)
    if is_missing:
        return "N/A"
    percentage = x * 100
    return f"{percentage:.1f}%"
def empty_figure(title: str, message: str = "No data available yet") -> go.Figure:
    """Build a blank Plotly figure that shows a centred placeholder message.

    Args:
        title: Figure title.
        message: Text of the annotation placed at the centre of the figure.

    Returns:
        A ``go.Figure`` with no traces, only layout and one annotation.
    """
    # Paper-relative coordinates keep the note centred regardless of axes.
    placeholder_note = dict(
        text=message,
        x=0.5,
        y=0.5,
        xref="paper",
        yref="paper",
        showarrow=False,
        font=dict(size=15, color="rgba(53,32,138,0.65)"),
    )
    layout_options = dict(
        title=title,
        template="plotly_white",
        paper_bgcolor="rgba(255,255,255,0.95)",
        plot_bgcolor="rgba(255,255,255,0.98)",
        height=420,
        annotations=[placeholder_note],
    )
    figure = go.Figure()
    figure.update_layout(**layout_options)
    return figure
def coerce_numeric(series: pd.Series) -> pd.Series:
    """Convert ``series`` to numeric values; unparseable entries become NaN."""
    converted = pd.to_numeric(series, errors="coerce")
    return converted
def normalize_rate(series: pd.Series) -> pd.Series:
    """Return ``series`` as numeric rates, rescaling percent-style values.

    Values are converted to numbers (unparseable entries become NaN). When
    the largest value exceeds 1.5 the whole series is divided by 100;
    otherwise it is returned as converted.
    """
    rates = pd.to_numeric(series, errors="coerce")
    has_values = not rates.dropna().empty
    if has_values and rates.max() > 1.5:
        return rates / 100.0
    return rates
def find_first_column(df: pd.DataFrame, candidates):
    """Return the first name in ``candidates`` that is a column of ``df``, else ``None``."""
    return next((name for name in candidates if name in df.columns), None)
def pick_primary_sheet(file_path: str) -> pd.DataFrame:
    """Load the first relevant worksheet of an Excel workbook.

    Sheets whose stripped, lower-cased name is in IGNORE_SHEETS are skipped;
    if every sheet is ignored, the first sheet is used anyway.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        The chosen sheet as a DataFrame with normalized column names.
    """
    # The context manager closes the workbook handle; the same handle is
    # reused for parsing so the file is opened only once.
    with pd.ExcelFile(file_path) as excel:
        sheet_names = excel.sheet_names
        valid_sheets = [s for s in sheet_names if s.strip().lower() not in IGNORE_SHEETS]
        chosen = valid_sheets[0] if valid_sheets else sheet_names[0]
        df = pd.read_excel(excel, sheet_name=chosen)
    df.columns = normalize_columns(df.columns)
    return df
def read_uploaded_excel(file_obj):
    """Read an uploaded workbook into a DataFrame; ``None`` in, ``None`` out.

    ``file_obj`` may expose the path as a ``name`` attribute; otherwise its
    string form is used as the path.
    """
    if file_obj is None:
        return None
    try:
        path = file_obj.name
    except AttributeError:
        path = str(file_obj)
    return pick_primary_sheet(path)
def clip_text(text, n=220):
    """Collapse whitespace in ``text`` and shorten it to at most ``n`` characters.

    Text longer than ``n`` is cut to ``n - 3`` characters and suffixed with
    ``"..."``; ``None`` is treated as an empty string.
    """
    raw = "" if text is None else str(text)
    flattened = re.sub(r"\s+", " ", raw).strip()
    if len(flattened) > n:
        return flattened[: n - 3] + "..."
    return flattened
def simple_sentiment_score(text: str) -> float:
    """Score ``text`` by lexicon hits: (positive - negative) / max(word count, 8).

    Empty input, or input without any alphabetic word, scores ``0.0``.
    """
    if not text:
        return 0.0
    tokens = re.findall(r"[a-zA-Z']+", str(text).lower())
    if not tokens:
        return 0.0
    balance = 0
    for token in tokens:
        if token in POSITIVE_WORDS:
            balance += 1
        if token in NEGATIVE_WORDS:
            balance -= 1
    # The floor of 8 keeps very short texts from producing extreme scores.
    return balance / max(len(tokens), 8)
def sentiment_label_from_score(score: float) -> str:
    """Map a score to ``"positive"`` (>= 0.03), ``"negative"`` (<= -0.03) or ``"neutral"``."""
    label = "neutral"
    if score >= 0.03:
        label = "positive"
    elif score <= -0.03:
        label = "negative"
    return label
def detect_themes(text: str):
    """List every theme in THEME_KEYWORDS with a keyword occurring in ``text``.

    Matching is case-insensitive substring containment; ``["general"]`` is
    returned when no theme matches.
    """
    haystack = str(text).lower()
    found = [
        theme
        for theme, keywords in THEME_KEYWORDS.items()
        if any(word in haystack for word in keywords)
    ]
    return found or ["general"]
 # =========================================================
+# REVIEW ANALYSIS
 # =========================================================
def analyze_reviews(df: pd.DataFrame):
    """Derive sentiment, theme and rating insights from a reviews table.

    Columns are located by name from fixed alias lists. Sentiment scores and
    themes are taken from the file when a ``sentiment_score`` or
    ``detected_theme`` / ``theme`` column exists; otherwise they are computed
    from the review text.

    Args:
        df: Reviews table. It is copied, never modified.

    Returns:
        dict with keys ``review_count``, ``avg_rating`` (float or None),
        ``sentiment_counts``, ``top_themes``, ``top_positive_themes``,
        ``top_negative_themes``, ``positive_example``, ``negative_example``,
        ``city_table`` (DataFrame, empty without ratings), ``clean_df``
        (preview of at most MAX_PREVIEW_ROWS rows) and ``full_df``.

    Raises:
        gr.Error: If neither a review-text nor a review-title column exists.
    """
    work = df.copy()
    text_col = find_first_column(work, ["review_text", "text", "review", "content"])
    title_col = find_first_column(work, ["review_title", "title", "headline"])
    rating_col = find_first_column(work, ["rating", "score", "stars", "review_score"])
    city_col = find_first_column(work, ["city", "location_city"])
    hotel_col = find_first_column(work, ["hotel_name", "name", "hotel"])
    date_col = find_first_column(work, ["review_date", "date", "stay_date"])
    sentiment_col = find_first_column(work, ["sentiment_score"])
    theme_col = find_first_column(work, ["detected_theme", "theme"])
    if text_col is None and title_col is None:
        raise gr.Error("The real reviews file needs at least a review text or review title column.")
    if text_col is None:
        # Title-only file: the title doubles as the review body. Clearing
        # title_col keeps the title from being concatenated with itself below.
        work["review_text"] = work[title_col].fillna("").astype(str)
        text_col = "review_text"
        title_col = None
    if title_col and title_col != text_col:
        work["combined_text"] = (
            work[title_col].fillna("").astype(str) + " " + work[text_col].fillna("").astype(str)
        ).str.strip()
    else:
        work["combined_text"] = work[text_col].fillna("").astype(str)
    if rating_col:
        work["rating_num"] = coerce_numeric(work[rating_col])
    else:
        # Float NaN keeps the column numeric so mean/sort/groupby work;
        # a pd.NA scalar would create an object-dtype column.
        work["rating_num"] = float("nan")
    if sentiment_col:
        work["sentiment_score_calc"] = coerce_numeric(work[sentiment_col]).fillna(0.0)
    else:
        work["sentiment_score_calc"] = work["combined_text"].apply(simple_sentiment_score)
    work["sentiment_label_calc"] = work["sentiment_score_calc"].apply(sentiment_label_from_score)
    if theme_col:
        work["theme_list_calc"] = work[theme_col].fillna("general").astype(str).apply(
            lambda x: [t.strip() for t in str(x).split(",") if t.strip()]
        )
    else:
        work["theme_list_calc"] = work["combined_text"].apply(detect_themes)
    work["primary_theme"] = work["theme_list_calc"].apply(lambda x: x[0] if x else "general")
    # Expose city/hotel under their canonical names as well, because the
    # pricing join looks for "city" and "hotel_name" specifically.
    if city_col is None:
        work["city"] = "Unknown"
        city_col = "city"
    elif city_col != "city":
        work["city"] = work[city_col]
    if hotel_col is None:
        work["hotel_name"] = "Unknown"
        hotel_col = "hotel_name"
    elif hotel_col != "hotel_name":
        work["hotel_name"] = work[hotel_col]
    sentiment_counts = (
        work["sentiment_label_calc"]
        .value_counts()
        .reindex(["positive", "neutral", "negative"], fill_value=0)
        .to_dict()
    )
    avg_rating = work["rating_num"].mean()
    pos_themes = Counter()
    neg_themes = Counter()
    all_themes = Counter()
    for themes, label in zip(work["theme_list_calc"], work["sentiment_label_calc"]):
        all_themes.update(themes)
        if label == "positive":
            pos_themes.update(themes)
        elif label == "negative":
            neg_themes.update(themes)
    pos_df = work[work["sentiment_label_calc"] == "positive"].copy()
    neg_df = work[work["sentiment_label_calc"] == "negative"].copy()
    pos_example = ""
    neg_example = ""
    if not pos_df.empty:
        # Most positive score first, higher rating as tie-breaker.
        pos_df = pos_df.sort_values(["sentiment_score_calc", "rating_num"], ascending=[False, False], na_position="last")
        pos_example = clip_text(pos_df.iloc[0]["combined_text"])
    if not neg_df.empty:
        # Most negative score first, lower rating as tie-breaker.
        neg_df = neg_df.sort_values(["sentiment_score_calc", "rating_num"], ascending=[True, True], na_position="last")
        neg_example = clip_text(neg_df.iloc[0]["combined_text"])
    rating_city = pd.DataFrame()
    if work["rating_num"].notna().any():
        rating_city = (
            work.groupby(city_col, dropna=False)
            .agg(review_count=("combined_text", "count"), average_rating=("rating_num", "mean"))
            .reset_index()
            .sort_values("average_rating", ascending=False)
        )
    preview_cols = [
        c for c in [
            hotel_col, city_col, date_col, "combined_text", "rating_num",
            "sentiment_score_calc", "sentiment_label_calc", "primary_theme"
        ] if c is not None and c in work.columns
    ]
    summary = {
        "review_count": int(len(work)),
        "avg_rating": float(avg_rating) if pd.notna(avg_rating) else None,
        "sentiment_counts": sentiment_counts,
        "top_themes": dict(all_themes.most_common(8)),
        "top_positive_themes": dict(pos_themes.most_common(5)),
        "top_negative_themes": dict(neg_themes.most_common(5)),
        "positive_example": pos_example,
        "negative_example": neg_example,
        "city_table": rating_city,
        "clean_df": work[preview_cols].head(MAX_PREVIEW_ROWS),
        "full_df": work,
    }
    return summary
 # =========================================================
+# BUSINESS ANALYSIS
 # =========================================================
def analyze_business(df: pd.DataFrame):
    """Compute pricing, occupancy, cancellation and revenue KPIs from a business table.

    Columns are located by name from fixed alias lists. Missing dimension
    columns are filled with ``"Unknown"`` and missing metrics with NaN.
    Revenue falls back to price x rooms booked when no revenue column exists.

    Args:
        df: Business table. It is copied, never modified.

    Returns:
        dict with keys ``avg_price``, ``avg_occupancy``, ``avg_cancellation``,
        ``total_revenue`` (each float or None), ``row_count``, ``clean_df``
        (preview of at most MAX_PREVIEW_ROWS rows) and ``full_df``.
    """
    work = df.copy()
    city_col = find_first_column(work, ["city"])
    hotel_col = find_first_column(work, ["hotel_name", "name", "hotel"])
    room_col = find_first_column(work, ["room_type", "room_category"])
    date_col = find_first_column(work, ["date", "week", "month"])
    price_col = find_first_column(work, ["nightly_price", "price", "avg_daily_rate"])
    occ_col = find_first_column(work, ["occupancy_rate", "occupancy"])
    cancel_col = find_first_column(work, ["cancellation_rate", "cancellations"])
    revenue_col = find_first_column(work, ["revenue"])
    rooms_booked_col = find_first_column(work, ["rooms_booked", "bookings"])
    if city_col is None:
        work["city"] = "Unknown"
    # The charts and the pricing join read "hotel_name" / "room_type" by
    # their canonical names, so aliased columns are mirrored under them.
    if hotel_col is None:
        work["hotel_name"] = "Unknown"
    elif hotel_col != "hotel_name":
        work["hotel_name"] = work[hotel_col]
    if room_col is None:
        work["room_type"] = "Unknown"
    elif room_col != "room_type":
        work["room_type"] = work[room_col]
    # Float NaN (not pd.NA) keeps placeholder metric columns numeric so that
    # mean/sum/groupby aggregations behave the same as for real data.
    missing = float("nan")
    work["nightly_price_num"] = coerce_numeric(work[price_col]) if price_col else missing
    work["occupancy_rate_num"] = normalize_rate(work[occ_col]) if occ_col else missing
    work["cancellation_rate_num"] = normalize_rate(work[cancel_col]) if cancel_col else missing
    if revenue_col:
        work["revenue_num"] = coerce_numeric(work[revenue_col])
    elif price_col and rooms_booked_col:
        work["revenue_num"] = work["nightly_price_num"] * coerce_numeric(work[rooms_booked_col])
    else:
        work["revenue_num"] = missing
    if date_col:
        work["date_num"] = pd.to_datetime(work[date_col], errors="coerce")
    else:
        work["date_num"] = pd.NaT

    def _mean_or_none(column):
        """Mean of a metric column, or None when it holds no values."""
        values = work[column]
        return float(values.mean()) if values.notna().any() else None

    revenue = work["revenue_num"]
    summary = {
        "avg_price": _mean_or_none("nightly_price_num"),
        "avg_occupancy": _mean_or_none("occupancy_rate_num"),
        "avg_cancellation": _mean_or_none("cancellation_rate_num"),
        "total_revenue": float(revenue.sum()) if revenue.notna().any() else None,
        "row_count": int(len(work)),
        "clean_df": work.head(MAX_PREVIEW_ROWS),
        "full_df": work,
    }
    return summary
+# =========================================================
+# PRICING LOGIC
+# =========================================================
def most_common_negative_theme(series_of_lists):
    """Return the most frequent theme across the given theme lists.

    Entries that are not lists are ignored; ``"general"`` is returned when
    nothing was counted.
    """
    tally = Counter(
        theme
        for entry in series_of_lists
        if isinstance(entry, list)
        for theme in entry
    )
    if not tally:
        return "general"
    (top_theme, _hits), = tally.most_common(1)
    return top_theme
def build_pricing_recommendations(review_summary, business_summary):
    """Combine review and business aggregates into per-segment pricing actions.

    Both ``full_df`` frames are grouped by the keys they share among ``city``
    and ``hotel_name`` (or by one synthetic ``portfolio`` key when they share
    none). Business aggregates are left-joined with review aggregates, and a
    rule-based decision is attached to every row.

    Args:
        review_summary: Result dict whose ``full_df`` holds the per-review columns.
        business_summary: Result dict whose ``full_df`` holds the ``*_num`` metrics.

    Returns:
        DataFrame with the join keys, aggregated metrics, ``priority_issue``,
        ``pricing_action`` and ``rationale``, sorted by occupancy and
        sentiment in descending order.
    """
    review_df = review_summary["full_df"].copy()
    business_df = business_summary["full_df"].copy()
    join_keys = [k for k in ["city", "hotel_name"] if k in review_df.columns and k in business_df.columns]
    if not join_keys:
        review_df["portfolio"] = "All Hotels"
        business_df["portfolio"] = "All Hotels"
        join_keys = ["portfolio"]
    review_df["negative_flag"] = (review_df["sentiment_label_calc"] == "negative").astype(int)
    review_group = (
        review_df.groupby(join_keys, dropna=False)
        .agg(
            avg_rating=("rating_num", "mean"),
            avg_sentiment=("sentiment_score_calc", "mean"),
            negative_share=("negative_flag", "mean"),
            review_count=("combined_text", "count"),
        )
        .reset_index()
    )
    # The priority issue is the dominant theme of NEGATIVE reviews only;
    # counting every review would report the most talked-about theme instead.
    negative_reviews = review_df[review_df["negative_flag"] == 1]
    if negative_reviews.empty:
        review_group["priority_issue"] = "general"
    else:
        theme_group = (
            negative_reviews.groupby(join_keys, dropna=False)["theme_list_calc"]
            .apply(most_common_negative_theme)
            .reset_index(name="priority_issue")
        )
        review_group = review_group.merge(theme_group, on=join_keys, how="left")
        # Segments without any negative review have no match in theme_group.
        review_group["priority_issue"] = review_group["priority_issue"].fillna("general")
    business_group = business_df.groupby(join_keys, dropna=False).agg(
        avg_price=("nightly_price_num", "mean"),
        avg_occupancy=("occupancy_rate_num", "mean"),
        avg_cancellation=("cancellation_rate_num", "mean"),
        total_revenue=("revenue_num", "sum"),
    ).reset_index()
    merged = business_group.merge(review_group, on=join_keys, how="left")

    def decide(row):
        """Return (action, rationale) for one aggregated segment row."""
        occ = row.get("avg_occupancy")
        sent = row.get("avg_sentiment")
        neg_share = row.get("negative_share")
        cancel = row.get("avg_cancellation")
        occ = occ if pd.notna(occ) else None
        sent = sent if pd.notna(sent) else None
        neg_share = neg_share if pd.notna(neg_share) else None
        cancel = cancel if pd.notna(cancel) else None
        if occ is not None and sent is not None and cancel is not None:
            if occ >= 0.80 and sent >= 0.03 and cancel <= 0.15:
                return "Raise price", "Strong demand and healthy guest perception support a measured increase."
            if occ >= 0.60 and sent >= 0.0 and cancel <= 0.22:
                return "Hold price", "Performance is stable. Maintain price and continue monitoring service quality."
            if sent < 0.0 or (neg_share is not None and neg_share > 0.35) or cancel > 0.25:
                return "Lower price / fix service", "Guest perception or cancellations are too weak to support a higher price."
            return "Hold and monitor", "Signals are mixed. Avoid aggressive changes until performance stabilises."
        if sent is not None:
            if sent >= 0.03:
                return "Hold or test small increase", "Sentiment is supportive, but operational data is incomplete."
            if sent < 0:
                return "Avoid increase", "Sentiment is weak, so price increases would be risky."
        return "Insufficient data", "More pricing or occupancy data is needed for a confident decision."

    # A plain list (instead of DataFrame.apply) also works for an empty frame,
    # where apply(axis=1) returns a DataFrame and the tuple unpacking breaks.
    decisions = [decide(row) for _, row in merged.iterrows()]
    merged["pricing_action"] = [action for action, _ in decisions]
    merged["rationale"] = [reason for _, reason in decisions]
    preferred_cols = join_keys + [
        "avg_price", "avg_occupancy", "avg_cancellation", "total_revenue",
        "avg_rating", "avg_sentiment", "negative_share", "priority_issue",
        "pricing_action", "rationale"
    ]
    preferred_cols = [c for c in preferred_cols if c in merged.columns]
    sort_cols = [c for c in ["avg_occupancy", "avg_sentiment"] if c in merged.columns]
    if sort_cols:
        merged = merged[preferred_cols].sort_values(by=sort_cols, ascending=False)
    else:
        merged = merged[preferred_cols]
    return merged
+# =========================================================
+# CHARTS
+# =========================================================
def chart_sentiment_distribution(review_summary):
    """Bar chart of review counts per sentiment label; placeholder figure when all counts are zero."""
    tally = review_summary["sentiment_counts"]
    frame = pd.DataFrame({
        "sentiment": list(tally.keys()),
        "count": list(tally.values()),
    })
    if frame["count"].sum() == 0:
        return empty_figure("Review Sentiment Distribution")
    palette = {
        "positive": "#2fbf9f",
        "neutral": "#f2b138",
        "negative": "#e05b77",
    }
    chart = px.bar(
        frame,
        x="sentiment",
        y="count",
        color="sentiment",
        color_discrete_map=palette,
        title="Review Sentiment Distribution",
    )
    chart.update_layout(
        template="plotly_white",
        paper_bgcolor="rgba(255,255,255,0.95)",
        height=420,
        showlegend=False,
    )
    return chart
def chart_top_themes(review_summary):
    """Horizontal bar chart of the most frequent review themes; placeholder when there are none."""
    themes = review_summary["top_themes"]
    if not themes:
        return empty_figure("Top Review Themes")
    frame = pd.DataFrame({
        "theme": list(themes.keys()),
        "count": list(themes.values()),
    })
    # Ascending order puts the largest bar at the top of a horizontal chart.
    frame = frame.sort_values("count", ascending=True)
    chart = px.bar(
        frame,
        x="count",
        y="theme",
        orientation="h",
        title="Top Review Themes",
        color="count",
        color_continuous_scale=["#c9bdf5", "#5f44cc"],
    )
    chart.update_layout(
        template="plotly_white",
        paper_bgcolor="rgba(255,255,255,0.95)",
        height=420,
    )
    return chart
def chart_rating_by_city(review_summary):
    """Bar chart of average rating per city; placeholder when the city table is missing or empty."""
    table = review_summary["city_table"]
    if table is None or table.empty:
        return empty_figure("Average Rating by City", "City and rating data not available")
    # The first column of the table is the grouping (city) column.
    grouping = table.columns[0]
    ranked = table.sort_values("average_rating", ascending=False)
    chart = px.bar(
        ranked,
        x=grouping,
        y="average_rating",
        title="Average Rating by City",
        color="average_rating",
        color_continuous_scale=["#f2d57d", "#35208a"],
    )
    chart.update_layout(
        template="plotly_white",
        paper_bgcolor="rgba(255,255,255,0.95)",
        height=420,
    )
    return chart
def chart_price_by_city(business_summary):
    """Bar chart of mean nightly price per city; placeholder when no price data exists."""
    data = business_summary["full_df"].copy()
    usable = (
        "city" in data.columns
        and "nightly_price_num" in data.columns
        and data["nightly_price_num"].notna().sum() != 0
    )
    if not usable:
        return empty_figure("Average Nightly Price by City", "Pricing data not available")
    by_city = data.groupby("city", dropna=False)["nightly_price_num"].mean().reset_index()
    ranked = by_city.sort_values("nightly_price_num", ascending=False)
    chart = px.bar(
        ranked,
        x="city",
        y="nightly_price_num",
        title="Average Nightly Price by City",
        color="nightly_price_num",
        color_continuous_scale=["#d5cdf8", "#35208a"],
    )
    chart.update_layout(
        template="plotly_white",
        paper_bgcolor="rgba(255,255,255,0.95)",
        height=420,
    )
    return chart
def chart_occupancy_by_room_type(business_summary):
    """Bar chart of mean occupancy per room type (percent axis); placeholder without occupancy data."""
    data = business_summary["full_df"].copy()
    usable = (
        "room_type" in data.columns
        and "occupancy_rate_num" in data.columns
        and data["occupancy_rate_num"].notna().sum() != 0
    )
    if not usable:
        return empty_figure("Occupancy by Room Type", "Occupancy data not available")
    by_room = data.groupby("room_type", dropna=False)["occupancy_rate_num"].mean().reset_index()
    ranked = by_room.sort_values("occupancy_rate_num", ascending=False)
    chart = px.bar(
        ranked,
        x="room_type",
        y="occupancy_rate_num",
        title="Average Occupancy by Room Type",
        color="occupancy_rate_num",
        color_continuous_scale=["#d2f2ea", "#2fbf9f"],
    )
    chart.update_layout(
        template="plotly_white",
        paper_bgcolor="rgba(255,255,255,0.95)",
        height=420,
    )
    # Occupancy is stored as a fraction, so show the axis as whole percents.
    chart.update_yaxes(tickformat=".0%")
    return chart
def chart_revenue_by_city(business_summary):
    """Bar chart of summed revenue per city; placeholder when no revenue data exists."""
    data = business_summary["full_df"].copy()
    usable = (
        "city" in data.columns
        and "revenue_num" in data.columns
        and data["revenue_num"].notna().sum() != 0
    )
    if not usable:
        return empty_figure("Revenue by City", "Revenue data not available")
    by_city = data.groupby("city", dropna=False)["revenue_num"].sum().reset_index()
    ranked = by_city.sort_values("revenue_num", ascending=False)
    chart = px.bar(
        ranked,
        x="city",
        y="revenue_num",
        title="Total Revenue by City",
        color="revenue_num",
        color_continuous_scale=["#f6d39a", "#f2b138"],
    )
    chart.update_layout(
        template="plotly_white",
        paper_bgcolor="rgba(255,255,255,0.95)",
        height=420,
    )
    return chart
+# =========================================================
+# TEXT OUTPUTS
+# =========================================================
def build_kpi_cards(review_summary, business_summary, pricing_df):
    """Render the headline KPIs as a responsive grid of HTML cards.

    Args:
        review_summary: Dict providing ``review_count`` and ``avg_rating``.
        business_summary: Dict providing ``avg_price``, ``avg_occupancy``,
            ``avg_cancellation`` and ``total_revenue``.
        pricing_df: Recommendations table; rows whose ``pricing_action`` is
            ``"Raise price"`` are counted as opportunities.

    Returns:
        str: HTML fragment containing one card per KPI.
    """
    raise_count = 0
    if not pricing_df.empty and "pricing_action" in pricing_df.columns:
        raise_count = int((pricing_df["pricing_action"] == "Raise price").sum())
    cards = [
        # Counts are integers: format them without the two decimals that the
        # generic number formatter would add.
        ("Reviews", f"{int(review_summary['review_count']):,}"),
        ("Avg Rating", format_num(review_summary["avg_rating"])),
        ("Avg Nightly Price", format_num(business_summary["avg_price"])),
        ("Avg Occupancy", format_pct(business_summary["avg_occupancy"])),
        ("Avg Cancellation", format_pct(business_summary["avg_cancellation"])),
        ("Total Revenue", format_num(business_summary["total_revenue"])),
        ("Raise-Price Opportunities", f"{raise_count:,}"),
    ]
    parts = ['<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(150px,1fr));gap:12px;margin-bottom:16px;">']
    for label, value in cards:
        parts.append(f"""
        <div style="
            background:rgba(255,255,255,0.80);
            border-radius:18px;
            padding:18px;
            border:1.5px solid rgba(255,255,255,0.80);
            box-shadow:0 8px 24px rgba(53,32,138,0.08);
            text-align:center;
        ">
            <div style="font-size:12px;font-weight:800;letter-spacing:1px;text-transform:uppercase;color:#6f5cb5;margin-bottom:8px;">{label}</div>
            <div style="font-size:22px;font-weight:900;color:#24115e;">{value}</div>
        </div>
        """)
    parts.append("</div>")
    return "".join(parts)
def build_review_summary_md(review_summary):
    """Compose the Markdown summary of review volume, sentiment, themes and example quotes."""

    def _top_three(themes):
        # First three theme names, or a fixed phrase when there are none.
        return ", ".join(list(themes)[:3]) or "none detected"

    counts = review_summary["sentiment_counts"]
    complaints = _top_three(review_summary["top_negative_themes"])
    praises = _top_three(review_summary["top_positive_themes"])
    best_quote = review_summary['positive_example'] or 'No clear positive example found.'
    worst_quote = review_summary['negative_example'] or 'No clear negative example found.'
    return f"""
### Review Insights Summary
- **Total reviews analysed:** {review_summary['review_count']}
- **Average rating:** {format_num(review_summary['avg_rating'])}
- **Positive reviews:** {counts.get('positive', 0)}
- **Neutral reviews:** {counts.get('neutral', 0)}
- **Negative reviews:** {counts.get('negative', 0)}
- **Top positive themes:** {praises}
- **Top complaint themes:** {complaints}
**Strong positive example:**
{best_quote}
**Strong negative example:**
{worst_quote}
"""
def build_business_summary_md(business_summary, pricing_df):
    """Compose the Markdown summary of business KPIs and the number of rows per pricing action."""
    has_actions = not pricing_df.empty and "pricing_action" in pricing_df.columns
    tally = pricing_df["pricing_action"].value_counts().to_dict() if has_actions else {}

    def _n(action):
        # Rows recommending this action; zero when it never occurs.
        return tally.get(action, 0)

    return f"""
### Pricing and Business Summary
- **Rows analysed in synthetic/business data:** {business_summary['row_count']}
- **Average nightly price:** {format_num(business_summary['avg_price'])}
- **Average occupancy rate:** {format_pct(business_summary['avg_occupancy'])}
- **Average cancellation rate:** {format_pct(business_summary['avg_cancellation'])}
- **Total revenue:** {format_num(business_summary['total_revenue'])}
**Pricing decision counts**
- Raise price: {_n('Raise price')}
- Hold price: {_n('Hold price')}
- Hold and monitor: {_n('Hold and monitor')}
- Lower price / fix service: {_n('Lower price / fix service')}
- Hold or test small increase: {_n('Hold or test small increase')}
- Avoid increase: {_n('Avoid increase')}
- Insufficient data: {_n('Insufficient data')}
This dashboard is designed as a **case-study decision tool** for hotel management, not just a chart viewer.
"""
def build_execution_log(review_df, business_df, pricing_df):
    """Return the plain-text run log: row counts, the first 12 column names of each input, and status lines."""
    review_cols = ", ".join(review_df.columns[:12])
    business_cols = ", ".join(business_df.columns[:12])
    lines = [
        "PROJECT PIPELINE COMPLETED",
        "Step 1 - Real-world review file loaded successfully",
        f"Rows: {len(review_df)}",
        f"Columns detected: {review_cols}",
        "Step 2 - Synthetic/business file loaded successfully",
        f"Rows: {len(business_df)}",
        f"Columns detected: {business_cols}",
        "Step 3 - Review sentiment and theme analysis completed",
        "Step 4 - Business KPI analysis completed",
        "Step 5 - Pricing optimisation logic completed",
        f"Recommendation rows generated: {len(pricing_df)}",
        "Status:",
        "- Qualitative analysis: ready",
        "- Quantitative analysis: ready",
        "- Pricing recommendations: ready",
        "- AI assistant context: ready",
    ]
    # The log ends with a newline after the last status line.
    return "\n".join(lines) + "\n"
+# =========================================================
+# MAIN PIPELINE
+# =========================================================
def run_pipeline(real_file, synthetic_file):
    """Run the full analysis for the two uploaded workbooks.

    Returns a 14-tuple: execution log, reviews preview, business preview, KPI
    HTML, review Markdown, business Markdown, six figures (sentiment, themes,
    rating by city, price by city, occupancy by room type, revenue by city),
    the top 20 pricing rows and the state dict handed to the AI assistant.

    Raises:
        gr.Error: If a file is missing or could not be read.
    """
    if any(upload is None for upload in (real_file, synthetic_file)):
        raise gr.Error("Please upload both Excel files before running the analysis.")
    real_df = read_uploaded_excel(real_file)
    synthetic_df = read_uploaded_excel(synthetic_file)
    if any(frame is None for frame in (real_df, synthetic_df)):
        raise gr.Error("Could not read one of the Excel files.")

    review_summary = analyze_reviews(real_df)
    business_summary = analyze_business(synthetic_df)
    pricing_df = build_pricing_recommendations(review_summary, business_summary)

    kpi_html = build_kpi_cards(review_summary, business_summary, pricing_df)
    review_md = build_review_summary_md(review_summary)
    business_md = build_business_summary_md(business_summary, pricing_df)
    log_text = build_execution_log(real_df, synthetic_df, pricing_df)

    top_rows = pricing_df.head(20)
    state = {
        "review_summary_text": review_md,
        "business_summary_text": business_md,
    }
    state.update({key: review_summary[key] for key in ("review_count", "avg_rating")})
    state.update({
        key: business_summary[key]
        for key in ("avg_price", "avg_occupancy", "avg_cancellation", "total_revenue")
    })
    state.update({key: review_summary[key] for key in ("top_negative_themes", "top_positive_themes")})
    state["pricing_table"] = top_rows.to_dict(orient="records")

    review_charts = [
        chart_sentiment_distribution(review_summary),
        chart_top_themes(review_summary),
        chart_rating_by_city(review_summary),
    ]
    business_charts = [
        chart_price_by_city(business_summary),
        chart_occupancy_by_room_type(business_summary),
        chart_revenue_by_city(business_summary),
    ]
    return (
        log_text,
        review_summary["clean_df"],
        business_summary["clean_df"],
        kpi_html,
        review_md,
        business_md,
        *review_charts,
        *business_charts,
        top_rows,
        state,
    )
 # =========================================================
+# AI ASSISTANT
 # =========================================================
def keyword_ai_reply(question: str, analysis_state: dict) -> str:
    """Answer a question from the stored analysis using keyword rules only.

    The first matching topic wins, in this order: complaints, praise,
    occupancy, cancellations, pricing, overview. Anything else gets a hint
    about the supported topics.
    """
    q = question.lower()
    if not analysis_state:
        return "Please run the analysis first in the Pipeline Runner tab."

    def mentions(*terms):
        return any(term in q for term in terms)

    def top_three(themes):
        return ", ".join([f"{name} ({hits})" for name, hits in list(themes.items())[:3]])

    complaints = analysis_state.get("top_negative_themes", {})
    praises = analysis_state.get("top_positive_themes", {})
    pricing_rows = analysis_state.get("pricing_table", [])

    if mentions("complaint", "negative", "problem"):
        if not complaints:
            return "No strong complaint pattern was detected."
        return f"The main guest complaint themes are: {top_three(complaints)}. These issues are likely weakening pricing power."

    if mentions("positive", "praise", "strength"):
        if not praises:
            return "No strong praise pattern was detected."
        return f"The strongest positive themes are: {top_three(praises)}. These are service strengths the hotel can protect and highlight."

    if mentions("occupancy"):
        return f"The average occupancy rate in the uploaded synthetic/business dataset is {format_pct(analysis_state.get('avg_occupancy'))}."

    if mentions("cancel"):
        return f"The average cancellation rate is {format_pct(analysis_state.get('avg_cancellation'))}. Higher cancellations make aggressive pricing riskier."

    if mentions("price", "pricing"):
        if not pricing_rows:
            return "I do not have pricing recommendations yet. Please run the analysis first."
        # The table is pre-sorted, so its first row is the lead recommendation.
        lead = pricing_rows[0]
        places = [str(lead[key]) for key in ("hotel_name", "city", "portfolio") if key in lead]
        where = " / ".join(places) if places else "the top segment"
        action = lead.get("pricing_action", "review the segment")
        rationale = lead.get("rationale", "")
        return f"The strongest current pricing recommendation is **{action}** for **{where}**. Reason: {rationale}"

    if mentions("summary", "overview"):
        return (
            f"Overview: {analysis_state.get('review_count', 'N/A')} reviews were analysed with an average rating of "
            f"{format_num(analysis_state.get('avg_rating'))}. The synthetic/business dataset shows an average nightly price of "
            f"{format_num(analysis_state.get('avg_price'))}, average occupancy of {format_pct(analysis_state.get('avg_occupancy'))}, "
            f"and average cancellation of {format_pct(analysis_state.get('avg_cancellation'))}."
        )

    return (
        "I can answer questions about complaints, positive themes, pricing, occupancy, cancellations, and overall summary. "
        "Try asking: 'What are the main complaints?' or 'Where should prices be raised?'"
    )
def build_llm_prompt(question: str, analysis_state: dict) -> str:
    """Assemble the LLM prompt: project context, both summaries, up to five pricing rows and the question."""
    review_text = analysis_state.get('review_summary_text', '')
    business_text = analysis_state.get('business_summary_text', '')
    top_rows = analysis_state.get('pricing_table', [])[:5]
    rows_json = json.dumps(top_rows, indent=2)
    return f"""
You are an AI hotel pricing analyst. Answer briefly and clearly in business language.
Project context:
- Goal: optimise hotel room pricing while protecting guest satisfaction.
- This app uses real review data plus synthetic/business data.
- The output should feel like a consulting-style case study.
Review summary:
{review_text}
Business summary:
{business_text}
Top pricing rows:
{rows_json}
User question:
{question}
Instructions:
- Give a direct answer.
- Mention pricing implications when relevant.
- Be concise.
"""
def call_n8n(question: str, analysis_state: dict):
    """Send the question and analysis state to the n8n webhook and return its answer.

    Args:
        question: The user's question.
        analysis_state: Dashboard state forwarded as context.

    Returns:
        ``None`` when no webhook URL is configured; otherwise the ``answer``
        field of the JSON response, a notice when that field is absent, or an
        ``"n8n connection error: ..."`` message when anything fails.
    """
    if not N8N_WEBHOOK_URL:
        return None
    try:
        import requests
        # Make the payload strict JSON before sending: the pricing rows can
        # hold NaN (unmatched segments), which is rejected when the body is
        # encoded. Round-tripping maps NaN/Infinity to null; default=str is a
        # deliberate catch-all for values json cannot encode natively.
        payload = json.loads(
            json.dumps({"question": question, "analysis_state": analysis_state}, default=str),
            parse_constant=lambda _token: None,
        )
        response = requests.post(N8N_WEBHOOK_URL, json=payload, timeout=25)
        response.raise_for_status()
        data = response.json()
        # Accept a JSON array response by using its first item.
        if isinstance(data, list):
            data = data[0] if data else {}
        if not isinstance(data, dict):
            return str(data)
        return data.get("answer", "n8n responded but did not return an answer field.")
    except Exception as e:
        # Best-effort integration: report the failure as the reply text
        # instead of raising into the UI.
        return f"n8n connection error: {e}"
def ask_ai(question, history, analysis_state):
    """Answer a chat question and append the exchange to the history.

    The answer comes from the n8n webhook when it returns something, else
    from the hosted LLM when enabled, else from the keyword assistant.
    Blank questions leave the history untouched.

    Returns:
        tuple: ``(history, "")``; the empty string is the second output value.
    """
    if not (question and question.strip()):
        return history, ""

    def _answer():
        if not analysis_state:
            return "Please upload both files and run the analysis first in the Pipeline Runner tab."
        remote = call_n8n(question, analysis_state)
        if remote:
            return remote
        if not LLM_ENABLED:
            return keyword_ai_reply(question, analysis_state)
        try:
            prompt = build_llm_prompt(question, analysis_state)
            completion = llm_client.chat_completion(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are a concise hotel pricing analyst."},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=350,
                temperature=0.2,
            )
            # The client may hand back a plain dict or an object with attributes.
            if isinstance(completion, dict):
                return completion["choices"][0]["message"]["content"]
            return completion.choices[0].message.content
        except Exception as e:
            return f"LLM error: {e}. Falling back to built-in assistant.\n\n" + keyword_ai_reply(question, analysis_state)

    turns = history or []
    exchange = [
        {"role": "user", "content": question},
        {"role": "assistant", "content": _answer()},
    ]
    return turns + exchange, ""
 # =========================================================
 # UI
 # =========================================================
# Initial HTML for the Dashboard tab's KPI area (passed as the starting value
# of the gr.HTML component); it prompts the user to run the pipeline.
placeholder_kpis = """
<div style="background:rgba(255,255,255,0.78);padding:18px;border-radius:18px;border:1px solid rgba(255,255,255,0.7);text-align:center;">
    <div style="font-size:22px;font-weight:900;color:#24115e;">Run the pipeline after uploading both Excel files</div>
    <div style="margin-top:8px;color:#6f5cb5;">The dashboard, pricing recommendations, and AI assistant will populate automatically.</div>
</div>
"""
+with gr.Blocks(title="AI Hotel Pricing Optimizer", css=load_css()) as demo:  # Top-level app: three tabs plus the event wiring below.
+    analysis_state = gr.State({})  # Starts as an empty dict; last output of run_pipeline and third input of ask_ai.
     gr.Markdown(
+        "# AI-Powered Hotel Pricing Optimization and Guest Experience Analyzer\n"
+        "*Case-study tool for using real hotel reviews and synthetic business data to support pricing decisions.*",
         elem_id="escp_title",
     )
     with gr.Tab("Pipeline Runner"):
+        gr.Markdown(
+            """
+### Project Goal
+This app helps a luxury hotel chain decide where to **raise, hold, or lower prices**
+while protecting guest satisfaction. It combines:
+- **Real-world review analysis** for qualitative insight
+- **Synthetic/business data analysis** for quantitative insight
+- **Pricing recommendations** for management decision support
+"""
+        )
         with gr.Row():
+            real_file = gr.File(label="Upload real reviews Excel file", file_types=[".xlsx"])  # First input of run_pipeline; picker limited to .xlsx.
+            synthetic_file = gr.File(label="Upload synthetic/business Excel file", file_types=[".xlsx"])  # Second input of run_pipeline; picker limited to .xlsx.
+        run_button = gr.Button("Run Full Hotel Pricing Analysis", variant="primary")  # Click handler is registered after the tabs, below.
+        run_log = gr.Textbox(label="Execution Log", lines=16, interactive=False)  # Read-only; first output of run_pipeline.
+        with gr.Row():
+            reviews_preview = gr.Dataframe(label="Real Reviews Preview", interactive=False)
+            business_preview = gr.Dataframe(label="Synthetic/Business Preview", interactive=False)
     with gr.Tab("Dashboard"):
+        kpi_html = gr.HTML(value=placeholder_kpis)  # Shows the placeholder card until run_pipeline overwrites it.
+        with gr.Row():
+            review_summary_md = gr.Markdown("Run the pipeline to generate the review summary.")
+            business_summary_md = gr.Markdown("Run the pipeline to generate the business summary.")
+        gr.Markdown("### Review Analysis")
+        with gr.Row():
+            sentiment_chart = gr.Plot(label="Sentiment Distribution")
+            theme_chart = gr.Plot(label="Top Review Themes")
+        with gr.Row():
+            rating_city_chart = gr.Plot(label="Average Rating by City")
+            price_city_chart = gr.Plot(label="Average Price by City")
+        with gr.Row():
+            occupancy_chart = gr.Plot(label="Occupancy by Room Type")
+            revenue_chart = gr.Plot(label="Revenue by City")
+        gr.Markdown("### Pricing Recommendations")
+        pricing_table = gr.Dataframe(label="Top Pricing Decisions", interactive=False)
     with gr.Tab('"AI" Dashboard'):
+        ai_status = (  # Status line chosen once at build time; precedence: n8n webhook URL, then LLM, then rule-based fallback.
+            "Connected to **n8n**." if N8N_WEBHOOK_URL
+            else "Connected to an **LLM**." if LLM_ENABLED
+            else "Using the built-in **rule-based assistant**. You can later upgrade this by adding `HF_API_KEY` or `N8N_WEBHOOK_URL` as Space secrets."
         )
         gr.Markdown(
+            f"""
+### Ask the Hotel Pricing Assistant
+{ai_status}
+Example questions:
+- What are the main complaints?
+- Where should prices be raised?
+- What should management fix first?
+- Give me a summary of the business situation.
+"""
         )
+        chatbot = gr.Chatbot(label="Conversation", height=420, type="messages")  # "messages" format matches the {"role", "content"} dicts that ask_ai appends to history.
+        ai_input = gr.Textbox(
+            label="Ask about your uploaded data",
+            placeholder="e.g. Where should prices be lowered?",
+            lines=1,
+        )
+        ai_input.submit(  # Sends the question to ask_ai on the textbox submit event.
+            ask_ai,
+            inputs=[ai_input, chatbot, analysis_state],  # Passed positionally as (question, history, analysis_state).
+            outputs=[chatbot, ai_input],  # ask_ai returns (history, ""), so the input box is cleared after each question.
         )
+    run_button.click(  # Registered inside the Blocks context but outside the tabs, after every referenced component exists.
+        run_pipeline,
+        inputs=[real_file, synthetic_file],
+        outputs=[  # 14 components; presumably run_pipeline returns 14 values in exactly this order -- verify against its definition.
+            run_log,
+            reviews_preview,
+            business_preview,
+            kpi_html,
+            review_summary_md,
+            business_summary_md,
+            sentiment_chart,
+            theme_chart,
+            rating_city_chart,
+            price_city_chart,
+            occupancy_chart,
+            revenue_chart,
+            pricing_table,
+            analysis_state,  # Stored in gr.State so ask_ai can read the analysis results later.
+        ],
+    )
+demo.launch(allowed_paths=[str(BASE_DIR)])  # Runs whenever the module executes (no __main__ guard); allowed_paths lets Gradio serve files under BASE_DIR.