Files changed (1) hide show
  1. app.py +863 -619
app.py CHANGED
@@ -1,17 +1,15 @@
1
  import os
2
  import re
3
  import json
4
- import time
5
- import traceback
6
  from pathlib import Path
7
- from typing import Dict, Any, List, Tuple
8
 
9
  import pandas as pd
10
  import gradio as gr
11
- import papermill as pm
12
  import plotly.graph_objects as go
13
 
14
- # Optional LLM (HuggingFace Inference API)
15
  try:
16
  from huggingface_hub import InferenceClient
17
  except Exception:
@@ -22,737 +20,983 @@ except Exception:
22
  # =========================================================
23
 
24
  BASE_DIR = Path(__file__).resolve().parent
25
-
26
- NB1 = os.environ.get("NB1", "datacreation.ipynb").strip()
27
- NB2 = os.environ.get("NB2", "pythonanalysis.ipynb").strip()
28
-
29
- RUNS_DIR = BASE_DIR / "runs"
30
- ART_DIR = BASE_DIR / "artifacts"
31
- PY_FIG_DIR = ART_DIR / "py" / "figures"
32
- PY_TAB_DIR = ART_DIR / "py" / "tables"
33
-
34
- PAPERMILL_TIMEOUT = int(os.environ.get("PAPERMILL_TIMEOUT", "1800"))
35
- MAX_PREVIEW_ROWS = int(os.environ.get("MAX_FILE_PREVIEW_ROWS", "50"))
36
- MAX_LOG_CHARS = int(os.environ.get("MAX_LOG_CHARS", "8000"))
37
-
38
  HF_API_KEY = os.environ.get("HF_API_KEY", "").strip()
39
- MODEL_NAME = os.environ.get("MODEL_NAME", "deepseek-ai/DeepSeek-R1").strip()
40
- HF_PROVIDER = os.environ.get("HF_PROVIDER", "novita").strip()
41
  N8N_WEBHOOK_URL = os.environ.get("N8N_WEBHOOK_URL", "").strip()
42
 
43
  LLM_ENABLED = bool(HF_API_KEY) and InferenceClient is not None
44
- llm_client = (
45
- InferenceClient(provider=HF_PROVIDER, api_key=HF_API_KEY)
46
- if LLM_ENABLED
47
- else None
48
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  # =========================================================
51
  # HELPERS
52
  # =========================================================
53
 
54
- def ensure_dirs():
55
- for p in [RUNS_DIR, ART_DIR, PY_FIG_DIR, PY_TAB_DIR]:
56
- p.mkdir(parents=True, exist_ok=True)
57
-
58
- def stamp():
59
- return time.strftime("%Y%m%d-%H%M%S")
60
-
61
- def tail(text: str, n: int = MAX_LOG_CHARS) -> str:
62
- return (text or "")[-n:]
63
-
64
- def _ls(dir_path: Path, exts: Tuple[str, ...]) -> List[str]:
65
- if not dir_path.is_dir():
66
- return []
67
- return sorted(p.name for p in dir_path.iterdir() if p.is_file() and p.suffix.lower() in exts)
68
-
69
- def _read_csv(path: Path) -> pd.DataFrame:
70
- return pd.read_csv(path, nrows=MAX_PREVIEW_ROWS)
71
 
72
- def _read_json(path: Path):
73
- with path.open(encoding="utf-8") as f:
74
- return json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- def artifacts_index() -> Dict[str, Any]:
77
- return {
78
- "python": {
79
- "figures": _ls(PY_FIG_DIR, (".png", ".jpg", ".jpeg")),
80
- "tables": _ls(PY_TAB_DIR, (".csv", ".json")),
81
- },
82
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  # =========================================================
85
- # PIPELINE RUNNERS
86
  # =========================================================
87
 
88
- def run_notebook(nb_name: str) -> str:
89
- ensure_dirs()
90
- nb_in = BASE_DIR / nb_name
91
- if not nb_in.exists():
92
- return f"ERROR: {nb_name} not found."
93
- nb_out = RUNS_DIR / f"run_{stamp()}_{nb_name}"
94
- pm.execute_notebook(
95
- input_path=str(nb_in),
96
- output_path=str(nb_out),
97
- cwd=str(BASE_DIR),
98
- log_output=True,
99
- progress_bar=False,
100
- request_save_on_cell_execute=True,
101
- execution_timeout=PAPERMILL_TIMEOUT,
102
- )
103
- return f"Executed {nb_name}"
 
 
 
 
 
 
 
 
 
104
 
 
 
 
 
105
 
106
- def run_datacreation() -> str:
107
- try:
108
- log = run_notebook(NB1)
109
- csvs = [f.name for f in BASE_DIR.glob("*.csv")]
110
- return f"OK {log}\n\nCSVs now in /app:\n" + "\n".join(f" - {c}" for c in sorted(csvs))
111
- except Exception as e:
112
- return f"FAILED {e}\n\n{traceback.format_exc()[-2000:]}"
113
 
 
114
 
115
- def run_pythonanalysis() -> str:
116
- try:
117
- log = run_notebook(NB2)
118
- idx = artifacts_index()
119
- figs = idx["python"]["figures"]
120
- tabs = idx["python"]["tables"]
121
- return (
122
- f"OK {log}\n\n"
123
- f"Figures: {', '.join(figs) or '(none)'}\n"
124
- f"Tables: {', '.join(tabs) or '(none)'}"
125
  )
126
- except Exception as e:
127
- return f"FAILED {e}\n\n{traceback.format_exc()[-2000:]}"
128
-
129
-
130
- def run_full_pipeline() -> str:
131
- logs = []
132
- logs.append("=" * 50)
133
- logs.append("STEP 1/2: Data Creation (web scraping + synthetic data)")
134
- logs.append("=" * 50)
135
- logs.append(run_datacreation())
136
- logs.append("")
137
- logs.append("=" * 50)
138
- logs.append("STEP 2/2: Python Analysis (sentiment, ARIMA, dashboard)")
139
- logs.append("=" * 50)
140
- logs.append(run_pythonanalysis())
141
- return "\n".join(logs)
142
-
143
-
144
- # =========================================================
145
- # GALLERY LOADERS
146
- # =========================================================
147
-
148
- def _load_all_figures() -> List[Tuple[str, str]]:
149
- """Return list of (filepath, caption) for Gallery."""
150
- items = []
151
- for p in sorted(PY_FIG_DIR.glob("*.png")):
152
- items.append((str(p), p.stem.replace('_', ' ').title()))
153
- return items
154
-
155
-
156
- def _load_table_safe(path: Path) -> pd.DataFrame:
157
- try:
158
- if path.suffix == ".json":
159
- obj = _read_json(path)
160
- if isinstance(obj, dict):
161
- return pd.DataFrame([obj])
162
- return pd.DataFrame(obj)
163
- return _read_csv(path)
164
- except Exception as e:
165
- return pd.DataFrame([{"error": str(e)}])
166
-
167
 
168
- def refresh_gallery():
169
- """Called when user clicks Refresh on Gallery tab."""
170
- figures = _load_all_figures()
171
- idx = artifacts_index()
172
 
173
- table_choices = list(idx["python"]["tables"])
 
 
174
 
175
- default_df = pd.DataFrame()
176
- if table_choices:
177
- default_df = _load_table_safe(PY_TAB_DIR / table_choices[0])
178
 
179
- return (
180
- figures if figures else [],
181
- gr.update(choices=table_choices, value=table_choices[0] if table_choices else None),
182
- default_df,
 
183
  )
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
- def on_table_select(choice: str):
187
- if not choice:
188
- return pd.DataFrame([{"hint": "Select a table above."}])
189
- path = PY_TAB_DIR / choice
190
- if not path.exists():
191
- return pd.DataFrame([{"error": f"File not found: {choice}"}])
192
- return _load_table_safe(path)
193
-
 
 
 
 
 
 
 
 
 
194
 
195
  # =========================================================
196
- # KPI LOADER
197
  # =========================================================
198
 
199
- def load_kpis() -> Dict[str, Any]:
200
- for candidate in [PY_TAB_DIR / "kpis.json", PY_FIG_DIR / "kpis.json"]:
201
- if candidate.exists():
202
- try:
203
- return _read_json(candidate)
204
- except Exception:
205
- pass
206
- return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
 
 
 
 
208
 
209
- # =========================================================
210
- # AI DASHBOARD -- LLM picks what to display
211
- # =========================================================
 
212
 
213
- DASHBOARD_SYSTEM = """You are an AI dashboard assistant for a book-sales analytics app.
214
- The user asks questions or requests about their data. You have access to pre-computed
215
- artifacts from a Python analysis pipeline.
 
 
 
216
 
217
- AVAILABLE ARTIFACTS (only reference ones that exist):
218
- {artifacts_json}
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
- KPI SUMMARY: {kpis_json}
 
 
221
 
222
- YOUR JOB:
223
- 1. Answer the user's question conversationally using the KPIs and your knowledge of the artifacts.
224
- 2. At the END of your response, output a JSON block (fenced with ```json ... ```) that tells
225
- the dashboard which artifact to display. The JSON must have this shape:
226
- {{"show": "figure"|"table"|"none", "scope": "python", "filename": "..."}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
- - Use "show": "figure" to display a chart image.
229
- - Use "show": "table" to display a CSV/JSON table.
230
- - Use "show": "none" if no artifact is relevant.
 
 
231
 
232
- RULES:
233
- - If the user asks about sales trends or forecasting by title, show sales_trends or arima figures.
234
- - If the user asks about sentiment, show sentiment figure or sentiment_counts table.
235
- - If the user asks about forecast accuracy or ARIMA, show arima figures.
236
- - If the user asks about top sellers, show top_titles_by_units_sold.csv.
237
- - If the user asks a general data question, pick the most relevant artifact.
238
- - Keep your answer concise (2-4 sentences), then the JSON block.
239
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
- JSON_BLOCK_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)
242
- FALLBACK_JSON_RE = re.compile(r"\{[^{}]*\"show\"[^{}]*\}", re.DOTALL)
 
 
 
243
 
 
244
 
245
- def _parse_display_directive(text: str) -> Dict[str, str]:
246
- m = JSON_BLOCK_RE.search(text)
247
- if m:
248
- try:
249
- return json.loads(m.group(1))
250
- except json.JSONDecodeError:
251
- pass
252
- m = FALLBACK_JSON_RE.search(text)
253
- if m:
254
- try:
255
- return json.loads(m.group(0))
256
- except json.JSONDecodeError:
257
- pass
258
- return {"show": "none"}
259
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
- def _clean_response(text: str) -> str:
262
- """Strip the JSON directive block from the displayed response."""
263
- return JSON_BLOCK_RE.sub("", text).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
- def _n8n_call(msg: str) -> Tuple[str, Dict]:
267
- """Call the student's n8n webhook and return (reply, directive)."""
268
- import requests as req
269
- try:
270
- resp = req.post(N8N_WEBHOOK_URL, json={"question": msg}, timeout=20)
271
- data = resp.json()
272
- answer = data.get("answer", "No response from n8n workflow.")
273
- chart = data.get("chart", "none")
274
- if chart and chart != "none":
275
- return answer, {"show": "figure", "chart": chart}
276
- return answer, {"show": "none"}
277
- except Exception as e:
278
- return f"n8n error: {e}. Falling back to keyword matching.", None
 
 
279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
 
281
- def ai_chat(user_msg: str, history: list):
282
- """Chat function for the AI Dashboard tab."""
283
- if not user_msg or not user_msg.strip():
284
- return history, "", None, None
 
 
 
 
 
 
 
 
 
 
 
285
 
286
- idx = artifacts_index()
287
- kpis = load_kpis()
 
288
 
289
- # Priority: n8n webhook > HF LLM > keyword fallback
290
- if N8N_WEBHOOK_URL:
291
- reply, directive = _n8n_call(user_msg)
292
- if directive is None:
293
- reply_fb, directive = _keyword_fallback(user_msg, idx, kpis)
294
- reply += "\n\n" + reply_fb
295
- elif not LLM_ENABLED:
296
- reply, directive = _keyword_fallback(user_msg, idx, kpis)
297
- else:
298
- system = DASHBOARD_SYSTEM.format(
299
- artifacts_json=json.dumps(idx, indent=2),
300
- kpis_json=json.dumps(kpis, indent=2) if kpis else "(no KPIs yet, run the pipeline first)",
301
- )
302
- msgs = [{"role": "system", "content": system}]
303
- for entry in (history or [])[-6:]:
304
- msgs.append(entry)
305
- msgs.append({"role": "user", "content": user_msg})
306
-
307
- try:
308
- r = llm_client.chat_completion(
309
- model=MODEL_NAME,
310
- messages=msgs,
311
- temperature=0.3,
312
- max_tokens=600,
313
- stream=False,
314
- )
315
- raw = (
316
- r["choices"][0]["message"]["content"]
317
- if isinstance(r, dict)
318
- else r.choices[0].message.content
319
- )
320
- directive = _parse_display_directive(raw)
321
- reply = _clean_response(raw)
322
- except Exception as e:
323
- reply = f"LLM error: {e}. Falling back to keyword matching."
324
- reply_fb, directive = _keyword_fallback(user_msg, idx, kpis)
325
- reply += "\n\n" + reply_fb
326
-
327
- # Resolve artifacts — build interactive Plotly charts when possible
328
- chart_out = None
329
- tab_out = None
330
- show = directive.get("show", "none")
331
- fname = directive.get("filename", "")
332
- chart_name = directive.get("chart", "")
333
-
334
- # Interactive chart builders keyed by name
335
- chart_builders = {
336
- "sales": build_sales_chart,
337
- "sentiment": build_sentiment_chart,
338
- "top_sellers": build_top_sellers_chart,
339
- }
340
 
341
- if chart_name and chart_name in chart_builders:
342
- chart_out = chart_builders[chart_name]()
343
- elif show == "figure" and fname:
344
- # Fallback: try to match filename to a chart builder
345
- if "sales_trend" in fname:
346
- chart_out = build_sales_chart()
347
- elif "sentiment" in fname:
348
- chart_out = build_sentiment_chart()
349
- elif "arima" in fname or "forecast" in fname:
350
- chart_out = build_sales_chart() # closest interactive equivalent
351
- else:
352
- chart_out = _empty_chart(f"No interactive chart for {fname}")
353
 
354
- if show == "table" and fname:
355
- fp = PY_TAB_DIR / fname
356
- if fp.exists():
357
- tab_out = _load_table_safe(fp)
358
- else:
359
- reply += f"\n\n*(Could not find table: {fname})*"
360
 
361
- new_history = (history or []) + [
362
- {"role": "user", "content": user_msg},
363
- {"role": "assistant", "content": reply},
364
- ]
 
 
 
365
 
366
- return new_history, "", chart_out, tab_out
 
367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
 
369
- def _keyword_fallback(msg: str, idx: Dict, kpis: Dict) -> Tuple[str, Dict]:
370
- """Simple keyword matcher when LLM is unavailable."""
371
- msg_lower = msg.lower()
372
 
373
- if not idx["python"]["figures"] and not idx["python"]["tables"]:
374
- return (
375
- "No artifacts found yet. Please run the pipeline first (Tab 1), "
376
- "then come back here to explore the results.",
377
- {"show": "none"},
378
- )
379
 
380
- kpi_text = ""
381
- if kpis:
382
- total = kpis.get("total_units_sold", 0)
383
- kpi_text = (
384
- f"Quick summary: **{kpis.get('n_titles', '?')}** book titles across "
385
- f"**{kpis.get('n_months', '?')}** months, with **{total:,.0f}** total units sold."
386
- )
387
 
388
- if any(w in msg_lower for w in ["trend", "sales trend", "monthly sale"]):
389
- return (
390
- f"Here are the sales trends. {kpi_text}",
391
- {"show": "figure", "chart": "sales"},
392
- )
393
 
394
- if any(w in msg_lower for w in ["sentiment", "review", "positive", "negative"]):
395
- return (
396
- f"Here is the sentiment distribution across sampled book titles. {kpi_text}",
397
- {"show": "figure", "chart": "sentiment"},
398
- )
399
 
400
- if any(w in msg_lower for w in ["arima", "forecast", "predict"]):
401
- return (
402
- f"Here are the sales trends and forecasts. {kpi_text}",
403
- {"show": "figure", "chart": "sales"},
404
- )
405
 
406
- if any(w in msg_lower for w in ["top", "best sell", "popular", "rank"]):
407
- return (
408
- f"Here are the top-selling titles by units sold. {kpi_text}",
409
- {"show": "table", "scope": "python", "filename": "top_titles_by_units_sold.csv"},
410
- )
411
 
412
- if any(w in msg_lower for w in ["price", "pricing", "decision"]):
413
- return (
414
- f"Here are the pricing decisions. {kpi_text}",
415
- {"show": "table", "scope": "python", "filename": "pricing_decisions.csv"},
416
- )
 
 
417
 
418
- if any(w in msg_lower for w in ["dashboard", "overview", "summary", "kpi"]):
419
- return (
420
- f"Dashboard overview: {kpi_text}\n\nAsk me about sales trends, sentiment, forecasts, "
421
- "pricing, or top sellers to see specific visualizations.",
422
- {"show": "table", "scope": "python", "filename": "df_dashboard.csv"},
423
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
 
425
- # Default
426
  return (
427
- f"I can show you various analyses. {kpi_text}\n\n"
428
- "Try asking about: **sales trends**, **sentiment**, **ARIMA forecasts**, "
429
- "**pricing decisions**, **top sellers**, or **dashboard overview**.",
430
- {"show": "none"},
 
 
 
 
 
 
 
 
 
 
431
  )
432
 
433
-
434
  # =========================================================
435
- # KPI CARDS (BubbleBusters style)
436
  # =========================================================
437
 
438
- def render_kpi_cards() -> str:
439
- kpis = load_kpis()
440
- if not kpis:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  return (
442
- '<div style="background:rgba(255,255,255,.65);backdrop-filter:blur(16px);'
443
- 'border-radius:20px;padding:28px;text-align:center;'
444
- 'border:1.5px solid rgba(255,255,255,.7);'
445
- 'box-shadow:0 8px 32px rgba(124,92,191,.08);">'
446
- '<div style="font-size:36px;margin-bottom:10px;">📊</div>'
447
- '<div style="color:#a48de8;font-size:14px;'
448
- 'font-weight:800;margin-bottom:6px;">No data yet</div>'
449
- '<div style="color:#9d8fc4;font-size:12px;">'
450
- 'Run the pipeline to populate these cards.</div>'
451
- '</div>'
452
  )
453
 
454
- def card(icon, label, value, colour):
455
- return f"""
456
- <div style="background:rgba(255,255,255,.72);backdrop-filter:blur(16px);
457
- border-radius:20px;padding:18px 14px 16px;text-align:center;
458
- border:1.5px solid rgba(255,255,255,.8);
459
- box-shadow:0 4px 16px rgba(124,92,191,.08);
460
- border-top:3px solid {colour};">
461
- <div style="font-size:26px;margin-bottom:7px;line-height:1;">{icon}</div>
462
- <div style="color:#9d8fc4;font-size:9.5px;text-transform:uppercase;
463
- letter-spacing:1.8px;margin-bottom:7px;font-weight:800;">{label}</div>
464
- <div style="color:#2d1f4e;font-size:16px;font-weight:800;">{value}</div>
465
- </div>"""
466
-
467
- kpi_config = [
468
- ("n_titles", "📚", "Book Titles", "#a48de8"),
469
- ("n_months", "📅", "Time Periods", "#7aa6f8"),
470
- ("total_units_sold", "📦", "Units Sold", "#6ee7c7"),
471
- ("total_revenue", "💰", "Revenue", "#3dcba8"),
472
- ]
473
-
474
- html = (
475
- '<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));'
476
- 'gap:12px;margin-bottom:24px;">'
477
  )
478
- for key, icon, label, colour in kpi_config:
479
- val = kpis.get(key)
480
- if val is None:
481
- continue
482
- if isinstance(val, (int, float)) and val > 100:
483
- val = f"{val:,.0f}"
484
- html += card(icon, label, str(val), colour)
485
- # Extra KPIs not in config
486
- known = {k for k, *_ in kpi_config}
487
- for key, val in kpis.items():
488
- if key not in known:
489
- label = key.replace("_", " ").title()
490
- if isinstance(val, (int, float)) and val > 100:
491
- val = f"{val:,.0f}"
492
- html += card("📈", label, str(val), "#8fa8f8")
493
- html += "</div>"
494
- return html
495
 
 
 
 
496
 
497
- # =========================================================
498
- # INTERACTIVE PLOTLY CHARTS (BubbleBusters style)
499
- # =========================================================
 
500
 
501
- CHART_PALETTE = ["#7c5cbf", "#2ec4a0", "#e8537a", "#e8a230", "#5e8fef",
502
- "#c45ea8", "#3dbacc", "#a0522d", "#6aaa3a", "#d46060"]
503
 
504
- def _styled_layout(**kwargs) -> dict:
505
- defaults = dict(
506
- template="plotly_white",
507
- paper_bgcolor="rgba(255,255,255,0.95)",
508
- plot_bgcolor="rgba(255,255,255,0.98)",
509
- font=dict(family="system-ui, sans-serif", color="#2d1f4e", size=12),
510
- margin=dict(l=60, r=20, t=70, b=70),
511
- legend=dict(
512
- orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1,
513
- bgcolor="rgba(255,255,255,0.92)",
514
- bordercolor="rgba(124,92,191,0.35)", borderwidth=1,
515
- ),
516
- title=dict(font=dict(size=15, color="#4b2d8a")),
517
- )
518
- defaults.update(kwargs)
519
- return defaults
520
 
 
 
521
 
522
- def _empty_chart(title: str) -> go.Figure:
523
- fig = go.Figure()
524
- fig.update_layout(
525
- title=title, height=420, template="plotly_white",
526
- paper_bgcolor="rgba(255,255,255,0.95)",
527
- annotations=[dict(text="Run the pipeline to generate data",
528
- x=0.5, y=0.5, xref="paper", yref="paper", showarrow=False,
529
- font=dict(size=14, color="rgba(124,92,191,0.5)"))],
530
- )
531
- return fig
532
-
533
 
534
- def build_sales_chart() -> go.Figure:
535
- path = PY_TAB_DIR / "df_dashboard.csv"
536
- if not path.exists():
537
- return _empty_chart("Sales Trends — run the pipeline first")
538
- df = pd.read_csv(path)
539
- date_col = next((c for c in df.columns if "month" in c.lower() or "date" in c.lower()), None)
540
- val_cols = [c for c in df.columns if c != date_col and df[c].dtype in ("float64", "int64")]
541
- if not date_col or not val_cols:
542
- return _empty_chart("Could not auto-detect columns in df_dashboard.csv")
543
- df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
544
- fig = go.Figure()
545
- for i, col in enumerate(val_cols):
546
- fig.add_trace(go.Scatter(
547
- x=df[date_col], y=df[col], name=col.replace("_", " ").title(),
548
- mode="lines+markers", line=dict(color=CHART_PALETTE[i % len(CHART_PALETTE)], width=2),
549
- marker=dict(size=4),
550
- hovertemplate=f"<b>{col.replace('_',' ').title()}</b><br>%{{x|%b %Y}}: %{{y:,.0f}}<extra></extra>",
551
- ))
552
- fig.update_layout(**_styled_layout(height=450, hovermode="x unified",
553
- title=dict(text="Monthly Overview")))
554
- fig.update_xaxes(gridcolor="rgba(124,92,191,0.15)", showgrid=True)
555
- fig.update_yaxes(gridcolor="rgba(124,92,191,0.15)", showgrid=True)
556
- return fig
557
-
558
-
559
- def build_sentiment_chart() -> go.Figure:
560
- path = PY_TAB_DIR / "sentiment_counts_sampled.csv"
561
- if not path.exists():
562
- return _empty_chart("Sentiment Distribution — run the pipeline first")
563
- df = pd.read_csv(path)
564
- title_col = df.columns[0]
565
- sent_cols = [c for c in ["negative", "neutral", "positive"] if c in df.columns]
566
- if not sent_cols:
567
- return _empty_chart("No sentiment columns found in CSV")
568
- colors = {"negative": "#e8537a", "neutral": "#5e8fef", "positive": "#2ec4a0"}
569
- fig = go.Figure()
570
- for col in sent_cols:
571
- fig.add_trace(go.Bar(
572
- name=col.title(), y=df[title_col], x=df[col],
573
- orientation="h", marker_color=colors.get(col, "#888"),
574
- hovertemplate=f"<b>{col.title()}</b>: %{{x}}<extra></extra>",
575
- ))
576
- fig.update_layout(**_styled_layout(
577
- height=max(400, len(df) * 28), barmode="stack",
578
- title=dict(text="Sentiment Distribution by Book"),
579
- ))
580
- fig.update_xaxes(title="Number of Reviews")
581
- fig.update_yaxes(autorange="reversed")
582
- return fig
583
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
584
 
585
- def build_top_sellers_chart() -> go.Figure:
586
- path = PY_TAB_DIR / "top_titles_by_units_sold.csv"
587
- if not path.exists():
588
- return _empty_chart("Top Sellers — run the pipeline first")
589
- df = pd.read_csv(path).head(15)
590
- title_col = next((c for c in df.columns if "title" in c.lower()), df.columns[0])
591
- val_col = next((c for c in df.columns if "unit" in c.lower() or "sold" in c.lower()), df.columns[-1])
592
- fig = go.Figure(go.Bar(
593
- y=df[title_col], x=df[val_col], orientation="h",
594
- marker=dict(color=df[val_col], colorscale=[[0, "#c5b4f0"], [1, "#7c5cbf"]]),
595
- hovertemplate="<b>%{y}</b><br>Units: %{x:,.0f}<extra></extra>",
596
- ))
597
- fig.update_layout(**_styled_layout(
598
- height=max(400, len(df) * 30),
599
- title=dict(text="Top Selling Titles"), showlegend=False,
600
- ))
601
- fig.update_yaxes(autorange="reversed")
602
- fig.update_xaxes(title="Total Units Sold")
603
- return fig
604
 
 
605
 
606
- def refresh_dashboard():
607
- return render_kpi_cards(), build_sales_chart(), build_sentiment_chart(), build_top_sellers_chart()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
608
 
 
 
 
 
 
609
 
610
  # =========================================================
611
  # UI
612
  # =========================================================
613
 
614
- ensure_dirs()
615
-
616
- def load_css() -> str:
617
- css_path = BASE_DIR / "style.css"
618
- return css_path.read_text(encoding="utf-8") if css_path.exists() else ""
619
-
620
 
621
- with gr.Blocks(title="AIBDM 2026 Workshop App") as demo:
 
622
 
623
  gr.Markdown(
624
- "# SE21 App Template\n"
625
- "*This is an app template for SE21 students*",
626
  elem_id="escp_title",
627
  )
628
 
629
- # ===========================================================
630
- # TAB 1 -- Pipeline Runner
631
- # ===========================================================
632
  with gr.Tab("Pipeline Runner"):
633
- gr.Markdown()
634
-
635
- with gr.Row():
636
- with gr.Column(scale=1):
637
- btn_nb1 = gr.Button("Step 1: Data Creation", variant="secondary")
638
- with gr.Column(scale=1):
639
- btn_nb2 = gr.Button("Step 2: Python Analysis", variant="secondary")
 
 
 
 
640
 
641
  with gr.Row():
642
- btn_all = gr.Button("Run Full Pipeline (Both Steps)", variant="primary")
 
643
 
644
- run_log = gr.Textbox(
645
- label="Execution Log",
646
- lines=18,
647
- max_lines=30,
648
- interactive=False,
649
- )
650
 
651
- btn_nb1.click(run_datacreation, outputs=[run_log])
652
- btn_nb2.click(run_pythonanalysis, outputs=[run_log])
653
- btn_all.click(run_full_pipeline, outputs=[run_log])
654
 
655
- # ===========================================================
656
- # TAB 2 -- Dashboard (KPIs + Interactive Charts + Gallery)
657
- # ===========================================================
658
  with gr.Tab("Dashboard"):
659
- kpi_html = gr.HTML(value=render_kpi_cards)
660
-
661
- refresh_btn = gr.Button("Refresh Dashboard", variant="primary")
662
 
663
- gr.Markdown("#### Interactive Charts")
664
- chart_sales = gr.Plot(label="Monthly Overview")
665
- chart_sentiment = gr.Plot(label="Sentiment Distribution")
666
- chart_top = gr.Plot(label="Top Sellers")
667
 
668
- gr.Markdown("#### Static Figures (from notebooks)")
669
- gallery = gr.Gallery(
670
- label="Generated Figures",
671
- columns=2,
672
- height=480,
673
- object_fit="contain",
674
- )
675
 
676
- gr.Markdown("#### Data Tables")
677
- table_dropdown = gr.Dropdown(
678
- label="Select a table to view",
679
- choices=[],
680
- interactive=True,
681
- )
682
- table_display = gr.Dataframe(
683
- label="Table Preview",
684
- interactive=False,
685
- )
686
 
687
- def _on_refresh():
688
- kpi, c1, c2, c3 = refresh_dashboard()
689
- figs, dd, df = refresh_gallery()
690
- return kpi, c1, c2, c3, figs, dd, df
691
 
692
- refresh_btn.click(
693
- _on_refresh,
694
- outputs=[kpi_html, chart_sales, chart_sentiment, chart_top,
695
- gallery, table_dropdown, table_display],
696
- )
697
- table_dropdown.change(
698
- on_table_select,
699
- inputs=[table_dropdown],
700
- outputs=[table_display],
701
- )
702
 
703
- # ===========================================================
704
- # TAB 3 -- AI Dashboard
705
- # ===========================================================
706
  with gr.Tab('"AI" Dashboard'):
707
- _ai_status = (
708
- "Connected to your **n8n workflow**." if N8N_WEBHOOK_URL
709
- else "**LLM active.**" if LLM_ENABLED
710
- else "Using **keyword matching**. Upgrade options: "
711
- "set `N8N_WEBHOOK_URL` to connect your n8n workflow, "
712
- "or set `HF_API_KEY` for direct LLM access."
713
  )
 
714
  gr.Markdown(
715
- "### Ask questions, get interactive visualisations\n\n"
716
- f"Type a question and the system will pick the right interactive chart or table. {_ai_status}"
 
 
 
 
 
 
 
 
717
  )
718
 
719
- with gr.Row(equal_height=True):
720
- with gr.Column(scale=1):
721
- chatbot = gr.Chatbot(
722
- label="Conversation",
723
- height=380,
724
- )
725
- user_input = gr.Textbox(
726
- label="Ask about your data",
727
- placeholder="e.g. Show me sales trends / What are the top sellers? / Sentiment analysis",
728
- lines=1,
729
- )
730
- gr.Examples(
731
- examples=[
732
- "Show me the sales trends",
733
- "What does the sentiment look like?",
734
- "Which titles sell the most?",
735
- "Show the ARIMA forecasts",
736
- "What are the pricing decisions?",
737
- "Give me a dashboard overview",
738
- ],
739
- inputs=user_input,
740
- )
741
-
742
- with gr.Column(scale=1):
743
- ai_figure = gr.Plot(
744
- label="Interactive Chart",
745
- )
746
- ai_table = gr.Dataframe(
747
- label="Data Table",
748
- interactive=False,
749
- )
750
 
751
- user_input.submit(
752
- ai_chat,
753
- inputs=[user_input, chatbot],
754
- outputs=[chatbot, user_input, ai_figure, ai_table],
755
  )
756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757
 
758
- demo.launch(css=load_css(), allowed_paths=[str(BASE_DIR)])
 
1
  import os
2
  import re
3
  import json
 
 
4
  from pathlib import Path
5
+ from collections import Counter
6
 
7
  import pandas as pd
8
  import gradio as gr
9
+ import plotly.express as px
10
  import plotly.graph_objects as go
11
 
12
+ # Optional LLM support
13
  try:
14
  from huggingface_hub import InferenceClient
15
  except Exception:
 
20
  # =========================================================
21
 
22
  BASE_DIR = Path(__file__).resolve().parent
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  HF_API_KEY = os.environ.get("HF_API_KEY", "").strip()
24
+ MODEL_NAME = os.environ.get("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct").strip()
 
25
  N8N_WEBHOOK_URL = os.environ.get("N8N_WEBHOOK_URL", "").strip()
26
 
27
  LLM_ENABLED = bool(HF_API_KEY) and InferenceClient is not None
28
+ llm_client = InferenceClient(api_key=HF_API_KEY) if LLM_ENABLED else None
29
+
30
+ MAX_PREVIEW_ROWS = 15
31
+ IGNORE_SHEETS = {"data_dictionary", "sources", "source", "readme", "metadata"}
32
+
33
+ POSITIVE_WORDS = {
34
+ "great", "excellent", "amazing", "wonderful", "perfect", "pleasant", "friendly",
35
+ "clean", "comfortable", "beautiful", "fantastic", "helpful", "enjoyed", "loved",
36
+ "luxurious", "smooth", "spacious", "professional", "quiet", "impressive"
37
+ }
38
+
39
+ NEGATIVE_WORDS = {
40
+ "bad", "poor", "terrible", "awful", "dirty", "slow", "rude", "noisy", "expensive",
41
+ "disappointing", "uncomfortable", "broken", "worst", "late", "smell", "smelly",
42
+ "small", "crowded", "issue", "problem", "delay", "unhelpful", "overpriced"
43
+ }
44
+
45
+ THEME_KEYWORDS = {
46
+ "cleanliness": ["clean", "dirty", "smell", "smelly", "hygiene", "stain", "dust"],
47
+ "staff": ["staff", "service", "reception", "manager", "employee", "friendly", "rude", "helpful"],
48
+ "check_in": ["check-in", "check in", "queue", "waiting", "late", "front desk"],
49
+ "room_comfort": ["bed", "pillow", "comfortable", "room", "sleep", "spacious", "small"],
50
+ "noise": ["noise", "noisy", "loud", "street", "neighbors"],
51
+ "breakfast_food": ["breakfast", "food", "restaurant", "buffet", "dinner"],
52
+ "location": ["location", "near", "far", "transport", "view", "airport"],
53
+ "value_price": ["price", "expensive", "cheap", "value", "worth", "overpriced"]
54
+ }
55
 
56
  # =========================================================
57
  # HELPERS
58
  # =========================================================
59
 
60
+ def load_css() -> str:
61
+ css_path = BASE_DIR / "style.css"
62
+ return css_path.read_text(encoding="utf-8") if css_path.exists() else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
+ def normalize_columns(columns):
65
+ clean = []
66
+ for col in columns:
67
+ c = str(col).strip().lower()
68
+ c = re.sub(r"[^\w\s]", "", c)
69
+ c = re.sub(r"\s+", "_", c)
70
+ clean.append(c)
71
+ return clean
72
+
73
+ def format_num(x):
74
+ if x is None or pd.isna(x):
75
+ return "N/A"
76
+ if isinstance(x, (int, float)):
77
+ if abs(x) >= 1000:
78
+ return f"{x:,.0f}"
79
+ return f"{x:.2f}"
80
+ return str(x)
81
+
82
+ def format_pct(x):
83
+ if x is None or pd.isna(x):
84
+ return "N/A"
85
+ return f"{x * 100:.1f}%"
86
+
87
+ def empty_figure(title: str, message: str = "No data available yet") -> go.Figure:
88
+ fig = go.Figure()
89
+ fig.update_layout(
90
+ title=title,
91
+ template="plotly_white",
92
+ paper_bgcolor="rgba(255,255,255,0.95)",
93
+ plot_bgcolor="rgba(255,255,255,0.98)",
94
+ height=420,
95
+ annotations=[
96
+ dict(
97
+ text=message,
98
+ x=0.5,
99
+ y=0.5,
100
+ xref="paper",
101
+ yref="paper",
102
+ showarrow=False,
103
+ font=dict(size=15, color="rgba(53,32,138,0.65)")
104
+ )
105
+ ]
106
+ )
107
+ return fig
108
 
109
+ def coerce_numeric(series: pd.Series) -> pd.Series:
110
+ return pd.to_numeric(series, errors="coerce")
111
+
112
+ def normalize_rate(series: pd.Series) -> pd.Series:
113
+ s = coerce_numeric(series)
114
+ if s.dropna().empty:
115
+ return s
116
+ if s.max() > 1.5:
117
+ s = s / 100.0
118
+ return s
119
+
120
+ def find_first_column(df: pd.DataFrame, candidates):
121
+ for c in candidates:
122
+ if c in df.columns:
123
+ return c
124
+ return None
125
+
126
+ def pick_primary_sheet(file_path: str) -> pd.DataFrame:
127
+ excel = pd.ExcelFile(file_path)
128
+ sheet_names = excel.sheet_names
129
+ valid_sheets = [s for s in sheet_names if s.strip().lower() not in IGNORE_SHEETS]
130
+ chosen = valid_sheets[0] if valid_sheets else sheet_names[0]
131
+ df = pd.read_excel(file_path, sheet_name=chosen)
132
+ df.columns = normalize_columns(df.columns)
133
+ return df
134
+
135
+ def read_uploaded_excel(file_obj):
136
+ if file_obj is None:
137
+ return None
138
+ path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
139
+ return pick_primary_sheet(path)
140
+
141
+ def clip_text(text, n=220):
142
+ text = str(text) if text is not None else ""
143
+ text = re.sub(r"\s+", " ", text).strip()
144
+ return text if len(text) <= n else text[: n - 3] + "..."
145
+
146
+ def simple_sentiment_score(text: str) -> float:
147
+ if not text:
148
+ return 0.0
149
+ words = re.findall(r"[a-zA-Z']+", str(text).lower())
150
+ if not words:
151
+ return 0.0
152
+ pos = sum(1 for w in words if w in POSITIVE_WORDS)
153
+ neg = sum(1 for w in words if w in NEGATIVE_WORDS)
154
+ return (pos - neg) / max(len(words), 8)
155
+
156
+ def sentiment_label_from_score(score: float) -> str:
157
+ if score >= 0.03:
158
+ return "positive"
159
+ if score <= -0.03:
160
+ return "negative"
161
+ return "neutral"
162
+
163
+ def detect_themes(text: str):
164
+ text_lower = str(text).lower()
165
+ matches = []
166
+ for theme, keywords in THEME_KEYWORDS.items():
167
+ if any(k in text_lower for k in keywords):
168
+ matches.append(theme)
169
+ return matches if matches else ["general"]
170
 
171
  # =========================================================
172
+ # REVIEW ANALYSIS
173
  # =========================================================
174
 
175
+ def analyze_reviews(df: pd.DataFrame):
176
+ work = df.copy()
177
+
178
+ text_col = find_first_column(work, ["review_text", "text", "review", "content"])
179
+ title_col = find_first_column(work, ["review_title", "title", "headline"])
180
+ rating_col = find_first_column(work, ["rating", "score", "stars", "review_score"])
181
+ city_col = find_first_column(work, ["city", "location_city"])
182
+ hotel_col = find_first_column(work, ["hotel_name", "name", "hotel"])
183
+ date_col = find_first_column(work, ["review_date", "date", "stay_date"])
184
+ sentiment_col = find_first_column(work, ["sentiment_score"])
185
+ theme_col = find_first_column(work, ["detected_theme", "theme"])
186
+
187
+ if text_col is None and title_col is None:
188
+ raise gr.Error("The real reviews file needs at least a review text or review title column.")
189
+
190
+ if text_col is None:
191
+ work["review_text"] = work[title_col].fillna("").astype(str)
192
+ text_col = "review_text"
193
+
194
+ if title_col and title_col != text_col:
195
+ work["combined_text"] = (
196
+ work[title_col].fillna("").astype(str) + " " + work[text_col].fillna("").astype(str)
197
+ ).str.strip()
198
+ else:
199
+ work["combined_text"] = work[text_col].fillna("").astype(str)
200
 
201
+ if rating_col:
202
+ work["rating_num"] = coerce_numeric(work[rating_col])
203
+ else:
204
+ work["rating_num"] = pd.NA
205
 
206
+ if sentiment_col:
207
+ work["sentiment_score_calc"] = coerce_numeric(work[sentiment_col]).fillna(0.0)
208
+ else:
209
+ work["sentiment_score_calc"] = work["combined_text"].apply(simple_sentiment_score)
 
 
 
210
 
211
+ work["sentiment_label_calc"] = work["sentiment_score_calc"].apply(sentiment_label_from_score)
212
 
213
+ if theme_col:
214
+ work["theme_list_calc"] = work[theme_col].fillna("general").astype(str).apply(
215
+ lambda x: [t.strip() for t in str(x).split(",") if t.strip()]
 
 
 
 
 
 
 
216
  )
217
+ else:
218
+ work["theme_list_calc"] = work["combined_text"].apply(detect_themes)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
+ work["primary_theme"] = work["theme_list_calc"].apply(lambda x: x[0] if x else "general")
 
 
 
221
 
222
+ if city_col is None:
223
+ work["city"] = "Unknown"
224
+ city_col = "city"
225
 
226
+ if hotel_col is None:
227
+ work["hotel_name"] = "Unknown"
228
+ hotel_col = "hotel_name"
229
 
230
+ sentiment_counts = (
231
+ work["sentiment_label_calc"]
232
+ .value_counts()
233
+ .reindex(["positive", "neutral", "negative"], fill_value=0)
234
+ .to_dict()
235
  )
236
 
237
+ avg_rating = work["rating_num"].mean() if "rating_num" in work else None
238
+
239
+ pos_themes = Counter()
240
+ neg_themes = Counter()
241
+ all_themes = Counter()
242
+
243
+ for _, row in work.iterrows():
244
+ themes = row["theme_list_calc"]
245
+ for t in themes:
246
+ all_themes[t] += 1
247
+ if row["sentiment_label_calc"] == "positive":
248
+ pos_themes[t] += 1
249
+ elif row["sentiment_label_calc"] == "negative":
250
+ neg_themes[t] += 1
251
+
252
+ pos_df = work[work["sentiment_label_calc"] == "positive"].copy()
253
+ neg_df = work[work["sentiment_label_calc"] == "negative"].copy()
254
+
255
+ pos_example = ""
256
+ neg_example = ""
257
+
258
+ if not pos_df.empty:
259
+ pos_df = pos_df.sort_values(["sentiment_score_calc", "rating_num"], ascending=[False, False], na_position="last")
260
+ pos_example = clip_text(pos_df.iloc[0]["combined_text"])
261
+
262
+ if not neg_df.empty:
263
+ neg_df = neg_df.sort_values(["sentiment_score_calc", "rating_num"], ascending=[True, True], na_position="last")
264
+ neg_example = clip_text(neg_df.iloc[0]["combined_text"])
265
+
266
+ rating_city = pd.DataFrame()
267
+ if city_col in work.columns and "rating_num" in work.columns and work["rating_num"].notna().any():
268
+ rating_city = (
269
+ work.groupby(city_col, dropna=False)
270
+ .agg(review_count=("combined_text", "count"), average_rating=("rating_num", "mean"))
271
+ .reset_index()
272
+ .sort_values("average_rating", ascending=False)
273
+ )
274
 
275
+ summary = {
276
+ "review_count": int(len(work)),
277
+ "avg_rating": float(avg_rating) if pd.notna(avg_rating) else None,
278
+ "sentiment_counts": sentiment_counts,
279
+ "top_themes": dict(all_themes.most_common(8)),
280
+ "top_positive_themes": dict(pos_themes.most_common(5)),
281
+ "top_negative_themes": dict(neg_themes.most_common(5)),
282
+ "positive_example": pos_example,
283
+ "negative_example": neg_example,
284
+ "city_table": rating_city,
285
+ "clean_df": work[[c for c in [
286
+ hotel_col, city_col, date_col, "combined_text", "rating_num",
287
+ "sentiment_score_calc", "sentiment_label_calc", "primary_theme"
288
+ ] if c is not None and c in work.columns]].head(MAX_PREVIEW_ROWS),
289
+ "full_df": work,
290
+ }
291
+ return summary
292
 
293
  # =========================================================
294
+ # BUSINESS ANALYSIS
295
  # =========================================================
296
 
297
+ def analyze_business(df: pd.DataFrame):
298
+ work = df.copy()
299
+
300
+ city_col = find_first_column(work, ["city"])
301
+ hotel_col = find_first_column(work, ["hotel_name", "name", "hotel"])
302
+ room_col = find_first_column(work, ["room_type", "room_category"])
303
+ date_col = find_first_column(work, ["date", "week", "month"])
304
+ price_col = find_first_column(work, ["nightly_price", "price", "avg_daily_rate"])
305
+ occ_col = find_first_column(work, ["occupancy_rate", "occupancy"])
306
+ cancel_col = find_first_column(work, ["cancellation_rate", "cancellations"])
307
+ revenue_col = find_first_column(work, ["revenue"])
308
+ rooms_booked_col = find_first_column(work, ["rooms_booked", "bookings"])
309
+
310
+ if city_col is None:
311
+ work["city"] = "Unknown"
312
+ city_col = "city"
313
+
314
+ if hotel_col is None:
315
+ work["hotel_name"] = "Unknown"
316
+ hotel_col = "hotel_name"
317
+
318
+ if room_col is None:
319
+ work["room_type"] = "Unknown"
320
+ room_col = "room_type"
321
+
322
+ if price_col:
323
+ work["nightly_price_num"] = coerce_numeric(work[price_col])
324
+ else:
325
+ work["nightly_price_num"] = pd.NA
326
 
327
+ if occ_col:
328
+ work["occupancy_rate_num"] = normalize_rate(work[occ_col])
329
+ else:
330
+ work["occupancy_rate_num"] = pd.NA
331
 
332
+ if cancel_col:
333
+ work["cancellation_rate_num"] = normalize_rate(work[cancel_col])
334
+ else:
335
+ work["cancellation_rate_num"] = pd.NA
336
 
337
+ if revenue_col:
338
+ work["revenue_num"] = coerce_numeric(work[revenue_col])
339
+ elif price_col and rooms_booked_col:
340
+ work["revenue_num"] = coerce_numeric(work[price_col]) * coerce_numeric(work[rooms_booked_col])
341
+ else:
342
+ work["revenue_num"] = pd.NA
343
 
344
+ if date_col:
345
+ work["date_num"] = pd.to_datetime(work[date_col], errors="coerce")
346
+ else:
347
+ work["date_num"] = pd.NaT
348
+
349
+ summary = {
350
+ "avg_price": float(work["nightly_price_num"].mean()) if work["nightly_price_num"].notna().any() else None,
351
+ "avg_occupancy": float(work["occupancy_rate_num"].mean()) if work["occupancy_rate_num"].notna().any() else None,
352
+ "avg_cancellation": float(work["cancellation_rate_num"].mean()) if work["cancellation_rate_num"].notna().any() else None,
353
+ "total_revenue": float(work["revenue_num"].sum()) if work["revenue_num"].notna().any() else None,
354
+ "row_count": int(len(work)),
355
+ "clean_df": work.head(MAX_PREVIEW_ROWS),
356
+ "full_df": work,
357
+ }
358
+ return summary
359
 
360
+ # =========================================================
361
+ # PRICING LOGIC
362
+ # =========================================================
363
 
364
+ def most_common_negative_theme(series_of_lists):
365
+ counter = Counter()
366
+ for item in series_of_lists:
367
+ if isinstance(item, list):
368
+ for t in item:
369
+ counter[t] += 1
370
+ return counter.most_common(1)[0][0] if counter else "general"
371
+
372
+ def build_pricing_recommendations(review_summary, business_summary):
373
+ review_df = review_summary["full_df"].copy()
374
+ business_df = business_summary["full_df"].copy()
375
+
376
+ join_keys = [k for k in ["city", "hotel_name"] if k in review_df.columns and k in business_df.columns]
377
+ if not join_keys:
378
+ review_df["portfolio"] = "All Hotels"
379
+ business_df["portfolio"] = "All Hotels"
380
+ join_keys = ["portfolio"]
381
+
382
+ review_df["negative_flag"] = (review_df["sentiment_label_calc"] == "negative").astype(int)
383
+
384
+ review_group = (
385
+ review_df.groupby(join_keys, dropna=False)
386
+ .agg(
387
+ avg_rating=("rating_num", "mean"),
388
+ avg_sentiment=("sentiment_score_calc", "mean"),
389
+ negative_share=("negative_flag", "mean"),
390
+ review_count=("combined_text", "count"),
391
+ )
392
+ .reset_index()
393
+ )
394
 
395
+ theme_group = (
396
+ review_df.groupby(join_keys, dropna=False)["theme_list_calc"]
397
+ .apply(most_common_negative_theme)
398
+ .reset_index(name="priority_issue")
399
+ )
400
 
401
+ review_group = review_group.merge(theme_group, on=join_keys, how="left")
402
+
403
+ business_group = business_df.groupby(join_keys, dropna=False).agg(
404
+ avg_price=("nightly_price_num", "mean"),
405
+ avg_occupancy=("occupancy_rate_num", "mean"),
406
+ avg_cancellation=("cancellation_rate_num", "mean"),
407
+ total_revenue=("revenue_num", "sum"),
408
+ ).reset_index()
409
+
410
+ merged = business_group.merge(review_group, on=join_keys, how="left")
411
+
412
+ def decide(row):
413
+ occ = row.get("avg_occupancy")
414
+ sent = row.get("avg_sentiment")
415
+ neg_share = row.get("negative_share")
416
+ cancel = row.get("avg_cancellation")
417
+
418
+ occ = occ if pd.notna(occ) else None
419
+ sent = sent if pd.notna(sent) else None
420
+ neg_share = neg_share if pd.notna(neg_share) else None
421
+ cancel = cancel if pd.notna(cancel) else None
422
+
423
+ if occ is not None and sent is not None and cancel is not None:
424
+ if occ >= 0.80 and sent >= 0.03 and cancel <= 0.15:
425
+ return "Raise price", "Strong demand and healthy guest perception support a measured increase."
426
+ if occ >= 0.60 and sent >= 0.0 and cancel <= 0.22:
427
+ return "Hold price", "Performance is stable. Maintain price and continue monitoring service quality."
428
+ if sent < 0.0 or (neg_share is not None and neg_share > 0.35) or cancel > 0.25:
429
+ return "Lower price / fix service", "Guest perception or cancellations are too weak to support a higher price."
430
+ return "Hold and monitor", "Signals are mixed. Avoid aggressive changes until performance stabilises."
431
+
432
+ if sent is not None:
433
+ if sent >= 0.03:
434
+ return "Hold or test small increase", "Sentiment is supportive, but operational data is incomplete."
435
+ if sent < 0:
436
+ return "Avoid increase", "Sentiment is weak, so price increases would be risky."
437
+ return "Insufficient data", "More pricing or occupancy data is needed for a confident decision."
438
+
439
+ actions = merged.apply(lambda row: decide(row), axis=1)
440
+ merged["pricing_action"] = actions.apply(lambda x: x[0])
441
+ merged["rationale"] = actions.apply(lambda x: x[1])
442
+
443
+ preferred_cols = join_keys + [
444
+ "avg_price", "avg_occupancy", "avg_cancellation", "total_revenue",
445
+ "avg_rating", "avg_sentiment", "negative_share", "priority_issue",
446
+ "pricing_action", "rationale"
447
+ ]
448
+ preferred_cols = [c for c in preferred_cols if c in merged.columns]
449
 
450
+ sort_cols = [c for c in ["avg_occupancy", "avg_sentiment"] if c in merged.columns]
451
+ if sort_cols:
452
+ merged = merged[preferred_cols].sort_values(by=sort_cols, ascending=False)
453
+ else:
454
+ merged = merged[preferred_cols]
455
 
456
+ return merged
457
 
458
+ # =========================================================
459
+ # CHARTS
460
+ # =========================================================
 
 
 
 
 
 
 
 
 
 
 
461
 
462
+ def chart_sentiment_distribution(review_summary):
463
+ counts = review_summary["sentiment_counts"]
464
+ df = pd.DataFrame({
465
+ "sentiment": list(counts.keys()),
466
+ "count": list(counts.values())
467
+ })
468
+ if df["count"].sum() == 0:
469
+ return empty_figure("Review Sentiment Distribution")
470
+ fig = px.bar(
471
+ df,
472
+ x="sentiment",
473
+ y="count",
474
+ color="sentiment",
475
+ color_discrete_map={
476
+ "positive": "#2fbf9f",
477
+ "neutral": "#f2b138",
478
+ "negative": "#e05b77",
479
+ },
480
+ title="Review Sentiment Distribution"
481
+ )
482
+ fig.update_layout(template="plotly_white", paper_bgcolor="rgba(255,255,255,0.95)", height=420, showlegend=False)
483
+ return fig
484
 
485
+ def chart_top_themes(review_summary):
486
+ top_themes = review_summary["top_themes"]
487
+ if not top_themes:
488
+ return empty_figure("Top Review Themes")
489
+ df = pd.DataFrame({
490
+ "theme": list(top_themes.keys()),
491
+ "count": list(top_themes.values())
492
+ }).sort_values("count", ascending=True)
493
+ fig = px.bar(
494
+ df,
495
+ x="count",
496
+ y="theme",
497
+ orientation="h",
498
+ title="Top Review Themes",
499
+ color="count",
500
+ color_continuous_scale=["#c9bdf5", "#5f44cc"]
501
+ )
502
+ fig.update_layout(template="plotly_white", paper_bgcolor="rgba(255,255,255,0.95)", height=420)
503
+ return fig
504
 
505
+ def chart_rating_by_city(review_summary):
506
+ city_table = review_summary["city_table"]
507
+ if city_table is None or city_table.empty:
508
+ return empty_figure("Average Rating by City", "City and rating data not available")
509
+ city_col = city_table.columns[0]
510
+ fig = px.bar(
511
+ city_table.sort_values("average_rating", ascending=False),
512
+ x=city_col,
513
+ y="average_rating",
514
+ title="Average Rating by City",
515
+ color="average_rating",
516
+ color_continuous_scale=["#f2d57d", "#35208a"]
517
+ )
518
+ fig.update_layout(template="plotly_white", paper_bgcolor="rgba(255,255,255,0.95)", height=420)
519
+ return fig
520
 
521
+ def chart_price_by_city(business_summary):
522
+ df = business_summary["full_df"].copy()
523
+ if "city" not in df.columns or "nightly_price_num" not in df.columns or df["nightly_price_num"].notna().sum() == 0:
524
+ return empty_figure("Average Nightly Price by City", "Pricing data not available")
525
+ chart_df = df.groupby("city", dropna=False)["nightly_price_num"].mean().reset_index()
526
+ fig = px.bar(
527
+ chart_df.sort_values("nightly_price_num", ascending=False),
528
+ x="city",
529
+ y="nightly_price_num",
530
+ title="Average Nightly Price by City",
531
+ color="nightly_price_num",
532
+ color_continuous_scale=["#d5cdf8", "#35208a"]
533
+ )
534
+ fig.update_layout(template="plotly_white", paper_bgcolor="rgba(255,255,255,0.95)", height=420)
535
+ return fig
536
 
537
+ def chart_occupancy_by_room_type(business_summary):
538
+ df = business_summary["full_df"].copy()
539
+ if "room_type" not in df.columns or "occupancy_rate_num" not in df.columns or df["occupancy_rate_num"].notna().sum() == 0:
540
+ return empty_figure("Occupancy by Room Type", "Occupancy data not available")
541
+ chart_df = df.groupby("room_type", dropna=False)["occupancy_rate_num"].mean().reset_index()
542
+ fig = px.bar(
543
+ chart_df.sort_values("occupancy_rate_num", ascending=False),
544
+ x="room_type",
545
+ y="occupancy_rate_num",
546
+ title="Average Occupancy by Room Type",
547
+ color="occupancy_rate_num",
548
+ color_continuous_scale=["#d2f2ea", "#2fbf9f"]
549
+ )
550
+ fig.update_layout(template="plotly_white", paper_bgcolor="rgba(255,255,255,0.95)", height=420)
551
+ fig.update_yaxes(tickformat=".0%")
552
+ return fig
553
 
554
+ def chart_revenue_by_city(business_summary):
555
+ df = business_summary["full_df"].copy()
556
+ if "city" not in df.columns or "revenue_num" not in df.columns or df["revenue_num"].notna().sum() == 0:
557
+ return empty_figure("Revenue by City", "Revenue data not available")
558
+ chart_df = df.groupby("city", dropna=False)["revenue_num"].sum().reset_index()
559
+ fig = px.bar(
560
+ chart_df.sort_values("revenue_num", ascending=False),
561
+ x="city",
562
+ y="revenue_num",
563
+ title="Total Revenue by City",
564
+ color="revenue_num",
565
+ color_continuous_scale=["#f6d39a", "#f2b138"]
566
+ )
567
+ fig.update_layout(template="plotly_white", paper_bgcolor="rgba(255,255,255,0.95)", height=420)
568
+ return fig
569
 
570
+ # =========================================================
571
+ # TEXT OUTPUTS
572
+ # =========================================================
573
 
574
+ def build_kpi_cards(review_summary, business_summary, pricing_df):
575
+ cards = []
576
+
577
+ cards.append(("Reviews", format_num(review_summary["review_count"])))
578
+ cards.append(("Avg Rating", format_num(review_summary["avg_rating"])))
579
+ cards.append(("Avg Nightly Price", format_num(business_summary["avg_price"])))
580
+ cards.append(("Avg Occupancy", format_pct(business_summary["avg_occupancy"])))
581
+ cards.append(("Avg Cancellation", format_pct(business_summary["avg_cancellation"])))
582
+ cards.append(("Total Revenue", format_num(business_summary["total_revenue"])))
583
+
584
+ raise_count = 0
585
+ if not pricing_df.empty and "pricing_action" in pricing_df.columns:
586
+ raise_count = int((pricing_df["pricing_action"] == "Raise price").sum())
587
+ cards.append(("Raise-Price Opportunities", format_num(raise_count)))
588
+
589
+ html = '<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(150px,1fr));gap:12px;margin-bottom:16px;">'
590
+ for label, value in cards:
591
+ html += f"""
592
+ <div style="
593
+ background:rgba(255,255,255,0.80);
594
+ border-radius:18px;
595
+ padding:18px;
596
+ border:1.5px solid rgba(255,255,255,0.80);
597
+ box-shadow:0 8px 24px rgba(53,32,138,0.08);
598
+ text-align:center;
599
+ ">
600
+ <div style="font-size:12px;font-weight:800;letter-spacing:1px;text-transform:uppercase;color:#6f5cb5;margin-bottom:8px;">{label}</div>
601
+ <div style="font-size:22px;font-weight:900;color:#24115e;">{value}</div>
602
+ </div>
603
+ """
604
+ html += "</div>"
605
+ return html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606
 
607
+ def build_review_summary_md(review_summary):
608
+ sentiments = review_summary["sentiment_counts"]
609
+ top_negative = ", ".join(list(review_summary["top_negative_themes"].keys())[:3]) or "none detected"
610
+ top_positive = ", ".join(list(review_summary["top_positive_themes"].keys())[:3]) or "none detected"
 
 
 
 
 
 
 
 
611
 
612
+ md = f"""
613
+ ### Review Insights Summary
 
 
 
 
614
 
615
+ - **Total reviews analysed:** {review_summary['review_count']}
616
+ - **Average rating:** {format_num(review_summary['avg_rating'])}
617
+ - **Positive reviews:** {sentiments.get('positive', 0)}
618
+ - **Neutral reviews:** {sentiments.get('neutral', 0)}
619
+ - **Negative reviews:** {sentiments.get('negative', 0)}
620
+ - **Top positive themes:** {top_positive}
621
+ - **Top complaint themes:** {top_negative}
622
 
623
+ **Strong positive example:**
624
+ {review_summary['positive_example'] or 'No clear positive example found.'}
625
 
626
+ **Strong negative example:**
627
+ {review_summary['negative_example'] or 'No clear negative example found.'}
628
+ """
629
+ return md
630
+
631
+ def build_business_summary_md(business_summary, pricing_df):
632
+ action_counts = {}
633
+ if not pricing_df.empty and "pricing_action" in pricing_df.columns:
634
+ action_counts = pricing_df["pricing_action"].value_counts().to_dict()
635
+
636
+ md = f"""
637
+ ### Pricing and Business Summary
638
+
639
+ - **Rows analysed in synthetic/business data:** {business_summary['row_count']}
640
+ - **Average nightly price:** {format_num(business_summary['avg_price'])}
641
+ - **Average occupancy rate:** {format_pct(business_summary['avg_occupancy'])}
642
+ - **Average cancellation rate:** {format_pct(business_summary['avg_cancellation'])}
643
+ - **Total revenue:** {format_num(business_summary['total_revenue'])}
644
+
645
+ **Pricing decision counts**
646
+ - Raise price: {action_counts.get('Raise price', 0)}
647
+ - Hold price: {action_counts.get('Hold price', 0)}
648
+ - Hold and monitor: {action_counts.get('Hold and monitor', 0)}
649
+ - Lower price / fix service: {action_counts.get('Lower price / fix service', 0)}
650
+ - Hold or test small increase: {action_counts.get('Hold or test small increase', 0)}
651
+ - Avoid increase: {action_counts.get('Avoid increase', 0)}
652
+ - Insufficient data: {action_counts.get('Insufficient data', 0)}
653
+
654
+ This dashboard is designed as a **case-study decision tool** for hotel management, not just a chart viewer.
655
+ """
656
+ return md
657
 
658
+ def build_execution_log(review_df, business_df, pricing_df):
659
+ review_cols = ", ".join(review_df.columns[:12])
660
+ business_cols = ", ".join(business_df.columns[:12])
661
 
662
+ log = f"""PROJECT PIPELINE COMPLETED
 
 
 
 
 
663
 
664
+ Step 1 - Real-world review file loaded successfully
665
+ Rows: {len(review_df)}
666
+ Columns detected: {review_cols}
 
 
 
 
667
 
668
+ Step 2 - Synthetic/business file loaded successfully
669
+ Rows: {len(business_df)}
670
+ Columns detected: {business_cols}
 
 
671
 
672
+ Step 3 - Review sentiment and theme analysis completed
 
 
 
 
673
 
674
+ Step 4 - Business KPI analysis completed
 
 
 
 
675
 
676
+ Step 5 - Pricing optimisation logic completed
677
+ Recommendation rows generated: {len(pricing_df)}
 
 
 
678
 
679
+ Status:
680
+ - Qualitative analysis: ready
681
+ - Quantitative analysis: ready
682
+ - Pricing recommendations: ready
683
+ - AI assistant context: ready
684
+ """
685
+ return log
686
 
687
+ # =========================================================
688
+ # MAIN PIPELINE
689
+ # =========================================================
690
+
691
+ def run_pipeline(real_file, synthetic_file):
692
+ if real_file is None or synthetic_file is None:
693
+ raise gr.Error("Please upload both Excel files before running the analysis.")
694
+
695
+ real_df = read_uploaded_excel(real_file)
696
+ synthetic_df = read_uploaded_excel(synthetic_file)
697
+
698
+ if real_df is None or synthetic_df is None:
699
+ raise gr.Error("Could not read one of the Excel files.")
700
+
701
+ review_summary = analyze_reviews(real_df)
702
+ business_summary = analyze_business(synthetic_df)
703
+ pricing_df = build_pricing_recommendations(review_summary, business_summary)
704
+
705
+ kpi_html = build_kpi_cards(review_summary, business_summary, pricing_df)
706
+ review_md = build_review_summary_md(review_summary)
707
+ business_md = build_business_summary_md(business_summary, pricing_df)
708
+ log_text = build_execution_log(real_df, synthetic_df, pricing_df)
709
+
710
+ analysis_state = {
711
+ "review_summary_text": review_md,
712
+ "business_summary_text": business_md,
713
+ "review_count": review_summary["review_count"],
714
+ "avg_rating": review_summary["avg_rating"],
715
+ "avg_price": business_summary["avg_price"],
716
+ "avg_occupancy": business_summary["avg_occupancy"],
717
+ "avg_cancellation": business_summary["avg_cancellation"],
718
+ "total_revenue": business_summary["total_revenue"],
719
+ "top_negative_themes": review_summary["top_negative_themes"],
720
+ "top_positive_themes": review_summary["top_positive_themes"],
721
+ "pricing_table": pricing_df.head(20).to_dict(orient="records"),
722
+ }
723
 
 
724
  return (
725
+ log_text,
726
+ review_summary["clean_df"],
727
+ business_summary["clean_df"],
728
+ kpi_html,
729
+ review_md,
730
+ business_md,
731
+ chart_sentiment_distribution(review_summary),
732
+ chart_top_themes(review_summary),
733
+ chart_rating_by_city(review_summary),
734
+ chart_price_by_city(business_summary),
735
+ chart_occupancy_by_room_type(business_summary),
736
+ chart_revenue_by_city(business_summary),
737
+ pricing_df.head(20),
738
+ analysis_state,
739
  )
740
 
 
741
  # =========================================================
742
+ # AI ASSISTANT
743
  # =========================================================
744
 
745
+ def keyword_ai_reply(question: str, analysis_state: dict) -> str:
746
+ q = question.lower()
747
+
748
+ if not analysis_state:
749
+ return "Please run the analysis first in the Pipeline Runner tab."
750
+
751
+ top_neg = analysis_state.get("top_negative_themes", {})
752
+ top_pos = analysis_state.get("top_positive_themes", {})
753
+ pricing_table = analysis_state.get("pricing_table", [])
754
+
755
+ if "complaint" in q or "negative" in q or "problem" in q:
756
+ if top_neg:
757
+ top_items = ", ".join([f"{k} ({v})" for k, v in list(top_neg.items())[:3]])
758
+ return f"The main guest complaint themes are: {top_items}. These issues are likely weakening pricing power."
759
+ return "No strong complaint pattern was detected."
760
+
761
+ if "positive" in q or "praise" in q or "strength" in q:
762
+ if top_pos:
763
+ top_items = ", ".join([f"{k} ({v})" for k, v in list(top_pos.items())[:3]])
764
+ return f"The strongest positive themes are: {top_items}. These are service strengths the hotel can protect and highlight."
765
+ return "No strong praise pattern was detected."
766
+
767
+ if "occupancy" in q:
768
+ return f"The average occupancy rate in the uploaded synthetic/business dataset is {format_pct(analysis_state.get('avg_occupancy'))}."
769
+
770
+ if "cancel" in q:
771
+ return f"The average cancellation rate is {format_pct(analysis_state.get('avg_cancellation'))}. Higher cancellations make aggressive pricing riskier."
772
+
773
+ if "price" in q or "pricing" in q:
774
+ if pricing_table:
775
+ top = pricing_table[0]
776
+ location_bits = []
777
+ for key in ["hotel_name", "city", "portfolio"]:
778
+ if key in top:
779
+ location_bits.append(str(top[key]))
780
+ where = " / ".join(location_bits) if location_bits else "the top segment"
781
+ action = top.get("pricing_action", "review the segment")
782
+ rationale = top.get("rationale", "")
783
+ return f"The strongest current pricing recommendation is **{action}** for **{where}**. Reason: {rationale}"
784
+ return "I do not have pricing recommendations yet. Please run the analysis first."
785
+
786
+ if "summary" in q or "overview" in q:
787
  return (
788
+ f"Overview: {analysis_state.get('review_count', 'N/A')} reviews were analysed with an average rating of "
789
+ f"{format_num(analysis_state.get('avg_rating'))}. The synthetic/business dataset shows an average nightly price of "
790
+ f"{format_num(analysis_state.get('avg_price'))}, average occupancy of {format_pct(analysis_state.get('avg_occupancy'))}, "
791
+ f"and average cancellation of {format_pct(analysis_state.get('avg_cancellation'))}."
 
 
 
 
 
 
792
  )
793
 
794
+ return (
795
+ "I can answer questions about complaints, positive themes, pricing, occupancy, cancellations, and overall summary. "
796
+ "Try asking: 'What are the main complaints?' or 'Where should prices be raised?'"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
797
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
798
 
799
+ def build_llm_prompt(question: str, analysis_state: dict) -> str:
800
+ return f"""
801
+ You are an AI hotel pricing analyst. Answer briefly and clearly in business language.
802
 
803
+ Project context:
804
+ - Goal: optimise hotel room pricing while protecting guest satisfaction.
805
+ - This app uses real review data plus synthetic/business data.
806
+ - The output should feel like a consulting-style case study.
807
 
808
+ Review summary:
809
+ {analysis_state.get('review_summary_text', '')}
810
 
811
+ Business summary:
812
+ {analysis_state.get('business_summary_text', '')}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
813
 
814
+ Top pricing rows:
815
+ {json.dumps(analysis_state.get('pricing_table', [])[:5], indent=2)}
816
 
817
+ User question:
818
+ {question}
 
 
 
 
 
 
 
 
 
819
 
820
+ Instructions:
821
+ - Give a direct answer.
822
+ - Mention pricing implications when relevant.
823
+ - Be concise.
824
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
825
 
826
+ def call_n8n(question: str, analysis_state: dict):
827
+ if not N8N_WEBHOOK_URL:
828
+ return None
829
+ try:
830
+ import requests
831
+ payload = {
832
+ "question": question,
833
+ "analysis_state": analysis_state,
834
+ }
835
+ response = requests.post(N8N_WEBHOOK_URL, json=payload, timeout=25)
836
+ response.raise_for_status()
837
+ data = response.json()
838
+ return data.get("answer", "n8n responded but did not return an answer field.")
839
+ except Exception as e:
840
+ return f"n8n connection error: {e}"
841
 
842
+ def ask_ai(question, history, analysis_state):
843
+ if not question or not question.strip():
844
+ return history, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
845
 
846
+ history = history or []
847
 
848
+ if not analysis_state:
849
+ answer = "Please upload both files and run the analysis first in the Pipeline Runner tab."
850
+ else:
851
+ n8n_answer = call_n8n(question, analysis_state)
852
+ if n8n_answer:
853
+ answer = n8n_answer
854
+ elif LLM_ENABLED:
855
+ try:
856
+ prompt = build_llm_prompt(question, analysis_state)
857
+ completion = llm_client.chat_completion(
858
+ model=MODEL_NAME,
859
+ messages=[
860
+ {"role": "system", "content": "You are a concise hotel pricing analyst."},
861
+ {"role": "user", "content": prompt},
862
+ ],
863
+ max_tokens=350,
864
+ temperature=0.2,
865
+ )
866
+ if isinstance(completion, dict):
867
+ answer = completion["choices"][0]["message"]["content"]
868
+ else:
869
+ answer = completion.choices[0].message.content
870
+ except Exception as e:
871
+ answer = f"LLM error: {e}. Falling back to built-in assistant.\n\n" + keyword_ai_reply(question, analysis_state)
872
+ else:
873
+ answer = keyword_ai_reply(question, analysis_state)
874
 
875
+ history = history + [
876
+ {"role": "user", "content": question},
877
+ {"role": "assistant", "content": answer},
878
+ ]
879
+ return history, ""
880
 
881
  # =========================================================
882
  # UI
883
  # =========================================================
884
 
885
+ placeholder_kpis = """
886
+ <div style="background:rgba(255,255,255,0.78);padding:18px;border-radius:18px;border:1px solid rgba(255,255,255,0.7);text-align:center;">
887
+ <div style="font-size:22px;font-weight:900;color:#24115e;">Run the pipeline after uploading both Excel files</div>
888
+ <div style="margin-top:8px;color:#6f5cb5;">The dashboard, pricing recommendations, and AI assistant will populate automatically.</div>
889
+ </div>
890
+ """
891
 
892
+ with gr.Blocks(title="AI Hotel Pricing Optimizer", css=load_css()) as demo:
893
+ analysis_state = gr.State({})
894
 
895
  gr.Markdown(
896
+ "# AI-Powered Hotel Pricing Optimization and Guest Experience Analyzer\n"
897
+ "*Case-study tool for using real hotel reviews and synthetic business data to support pricing decisions.*",
898
  elem_id="escp_title",
899
  )
900
 
 
 
 
901
  with gr.Tab("Pipeline Runner"):
902
+ gr.Markdown(
903
+ """
904
+ ### Project Goal
905
+ This app helps a luxury hotel chain decide where to **raise, hold, or lower prices**
906
+ while protecting guest satisfaction. It combines:
907
+
908
+ - **Real-world review analysis** for qualitative insight
909
+ - **Synthetic/business data analysis** for quantitative insight
910
+ - **Pricing recommendations** for management decision support
911
+ """
912
+ )
913
 
914
  with gr.Row():
915
+ real_file = gr.File(label="Upload real reviews Excel file", file_types=[".xlsx"])
916
+ synthetic_file = gr.File(label="Upload synthetic/business Excel file", file_types=[".xlsx"])
917
 
918
+ run_button = gr.Button("Run Full Hotel Pricing Analysis", variant="primary")
919
+ run_log = gr.Textbox(label="Execution Log", lines=16, interactive=False)
 
 
 
 
920
 
921
+ with gr.Row():
922
+ reviews_preview = gr.Dataframe(label="Real Reviews Preview", interactive=False)
923
+ business_preview = gr.Dataframe(label="Synthetic/Business Preview", interactive=False)
924
 
 
 
 
925
  with gr.Tab("Dashboard"):
926
+ kpi_html = gr.HTML(value=placeholder_kpis)
 
 
927
 
928
+ with gr.Row():
929
+ review_summary_md = gr.Markdown("Run the pipeline to generate the review summary.")
930
+ business_summary_md = gr.Markdown("Run the pipeline to generate the business summary.")
 
931
 
932
+ gr.Markdown("### Review Analysis")
933
+ with gr.Row():
934
+ sentiment_chart = gr.Plot(label="Sentiment Distribution")
935
+ theme_chart = gr.Plot(label="Top Review Themes")
 
 
 
936
 
937
+ with gr.Row():
938
+ rating_city_chart = gr.Plot(label="Average Rating by City")
939
+ price_city_chart = gr.Plot(label="Average Price by City")
 
 
 
 
 
 
 
940
 
941
+ with gr.Row():
942
+ occupancy_chart = gr.Plot(label="Occupancy by Room Type")
943
+ revenue_chart = gr.Plot(label="Revenue by City")
 
944
 
945
+ gr.Markdown("### Pricing Recommendations")
946
+ pricing_table = gr.Dataframe(label="Top Pricing Decisions", interactive=False)
 
 
 
 
 
 
 
 
947
 
 
 
 
948
  with gr.Tab('"AI" Dashboard'):
949
+ ai_status = (
950
+ "Connected to **n8n**." if N8N_WEBHOOK_URL
951
+ else "Connected to an **LLM**." if LLM_ENABLED
952
+ else "Using the built-in **rule-based assistant**. You can later upgrade this by adding `HF_API_KEY` or `N8N_WEBHOOK_URL` as Space secrets."
 
 
953
  )
954
+
955
  gr.Markdown(
956
+ f"""
957
+ ### Ask the Hotel Pricing Assistant
958
+ {ai_status}
959
+
960
+ Example questions:
961
+ - What are the main complaints?
962
+ - Where should prices be raised?
963
+ - What should management fix first?
964
+ - Give me a summary of the business situation.
965
+ """
966
  )
967
 
968
+ chatbot = gr.Chatbot(label="Conversation", height=420, type="messages")
969
+ ai_input = gr.Textbox(
970
+ label="Ask about your uploaded data",
971
+ placeholder="e.g. Where should prices be lowered?",
972
+ lines=1,
973
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
974
 
975
+ ai_input.submit(
976
+ ask_ai,
977
+ inputs=[ai_input, chatbot, analysis_state],
978
+ outputs=[chatbot, ai_input],
979
  )
980
 
981
+ run_button.click(
982
+ run_pipeline,
983
+ inputs=[real_file, synthetic_file],
984
+ outputs=[
985
+ run_log,
986
+ reviews_preview,
987
+ business_preview,
988
+ kpi_html,
989
+ review_summary_md,
990
+ business_summary_md,
991
+ sentiment_chart,
992
+ theme_chart,
993
+ rating_city_chart,
994
+ price_city_chart,
995
+ occupancy_chart,
996
+ revenue_chart,
997
+ pricing_table,
998
+ analysis_state,
999
+ ],
1000
+ )
1001
 
1002
+ demo.launch(allowed_paths=[str(BASE_DIR)])