Yulu1 committed on
Commit
50e1016
·
verified ·
1 Parent(s): e65f8e9

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -621
app.py DELETED
@@ -1,621 +0,0 @@
1
- import os
2
- import re
3
- import json
4
- import time
5
- import traceback
6
- from pathlib import Path
7
- from typing import Dict, Any, List, Optional, Tuple
8
-
9
- import pandas as pd
10
- import gradio as gr
11
- import papermill as pm
12
-
13
- # Optional LLM (HuggingFace Inference API)
14
- try:
15
- from huggingface_hub import InferenceClient
16
- except Exception:
17
- InferenceClient = None
18
-
19
- # =========================================================
20
- # CONFIG
21
- # =========================================================
22
-
23
# Directory that contains this file; notebooks, runs and artifacts are
# resolved relative to it.
BASE_DIR = Path(__file__).resolve().parent

# Notebook file names, each overridable through an environment variable of
# the same name. NB1 is executed by run_datacreation(), NB2 by
# run_pythonanalysis() and NB3 by run_r().
# NOTE(review): the NB1/NB2 defaults look shifted by one relative to the step
# that runs them (the "data creation" step defaults to the Python analysis
# notebook, the Python step to the R notebook) -- confirm the intended names.
NB1 = os.environ.get("NB1", "pythonanalysis.ipynb").strip()
NB2 = os.environ.get("NB2", "ranalysis.ipynb").strip()
# run_r() reads NB3; it was previously undefined, so the R step always failed
# with a NameError.
NB3 = os.environ.get("NB3", "ranalysis.ipynb").strip()

# Output locations: executed notebook copies and the figures/tables that the
# Python and R notebooks are expected to write.
RUNS_DIR = BASE_DIR / "runs"
ART_DIR = BASE_DIR / "artifacts"
PY_FIG_DIR = ART_DIR / "py" / "figures"
PY_TAB_DIR = ART_DIR / "py" / "tables"
R_FIG_DIR = ART_DIR / "r" / "figures"
R_TAB_DIR = ART_DIR / "r" / "tables"

# Tunables read from the environment (values are parsed as integers).
PAPERMILL_TIMEOUT = int(os.environ.get("PAPERMILL_TIMEOUT", "1800"))
MAX_PREVIEW_ROWS = int(os.environ.get("MAX_FILE_PREVIEW_ROWS", "50"))
MAX_LOG_CHARS = int(os.environ.get("MAX_LOG_CHARS", "8000"))

# LLM settings; the chat tab falls back to keyword matching when no key is set
# or huggingface_hub could not be imported.
HF_API_KEY = os.environ.get("HF_API_KEY", "").strip()
MODEL_NAME = os.environ.get("MODEL_NAME", "deepseek-ai/DeepSeek-R1").strip()
HF_PROVIDER = os.environ.get("HF_PROVIDER", "novita").strip()

LLM_ENABLED = bool(HF_API_KEY) and InferenceClient is not None
llm_client = (
    InferenceClient(provider=HF_PROVIDER, api_key=HF_API_KEY)
    if LLM_ENABLED
    else None
)
49
-
50
- # =========================================================
51
- # HELPERS
52
- # =========================================================
53
-
54
def ensure_dirs():
    """Create the runs directory and every artifact directory if missing."""
    required = (RUNS_DIR, ART_DIR, PY_FIG_DIR, PY_TAB_DIR, R_FIG_DIR, R_TAB_DIR)
    for directory in required:
        directory.mkdir(parents=True, exist_ok=True)
57
-
58
def stamp():
    """Return the current local time formatted as ``YYYYmmdd-HHMMSS``."""
    now = time.localtime()
    return time.strftime("%Y%m%d-%H%M%S", now)
60
-
61
def tail(text: str, n: int = MAX_LOG_CHARS) -> str:
    """Return at most the last ``n`` characters of ``text``.

    ``None`` or empty ``text`` yields ``""``. A non-positive ``n`` also yields
    ``""``: the plain slice ``text[-n:]`` would return the whole text for
    ``n == 0`` and drop the head instead of keeping a tail for negative ``n``.
    """
    if not text or n <= 0:
        return ""
    return text[-n:]
63
-
64
def _ls(dir_path: Path, exts: Tuple[str, ...]) -> List[str]:
    """Return the sorted names of regular files in ``dir_path`` whose
    lower-cased suffix is in ``exts``; an absent directory gives ``[]``."""
    if not dir_path.is_dir():
        return []
    names = []
    for entry in dir_path.iterdir():
        if entry.is_file() and entry.suffix.lower() in exts:
            names.append(entry.name)
    names.sort()
    return names
68
-
69
def _read_csv(path: Path) -> pd.DataFrame:
    """Read a CSV file, loading at most ``MAX_PREVIEW_ROWS`` data rows."""
    preview = pd.read_csv(path, nrows=MAX_PREVIEW_ROWS)
    return preview
71
-
72
def _read_json(path: Path):
    """Decode the UTF-8 JSON file at ``path`` and return the parsed object."""
    raw = path.read_text(encoding="utf-8")
    return json.loads(raw)
75
-
76
def artifacts_index() -> Dict[str, Any]:
    """List the figure and table file names currently on disk.

    Returns a mapping with the keys ``"python"`` and ``"r"``; each value holds
    a ``"figures"`` list (.png/.jpg/.jpeg) and a ``"tables"`` list (.csv/.json).
    """
    figure_exts = (".png", ".jpg", ".jpeg")
    table_exts = (".csv", ".json")
    locations = (
        ("python", PY_FIG_DIR, PY_TAB_DIR),
        ("r", R_FIG_DIR, R_TAB_DIR),
    )
    return {
        scope: {
            "figures": _ls(fig_dir, figure_exts),
            "tables": _ls(tab_dir, table_exts),
        }
        for scope, fig_dir, tab_dir in locations
    }
87
-
88
- # =========================================================
89
- # PIPELINE RUNNERS
90
- # =========================================================
91
-
92
def run_notebook(nb_name: str) -> str:
    """Execute the notebook ``nb_name`` (relative to ``BASE_DIR``) with papermill.

    The executed copy is written to ``RUNS_DIR`` under a timestamped name.
    Returns ``"Executed <name>"`` on success, or an ``"ERROR: ..."`` string
    (no exception) when the notebook file does not exist. Exceptions raised by
    papermill propagate to the caller.
    """
    ensure_dirs()
    source = BASE_DIR / nb_name
    if not source.exists():
        return f"ERROR: {nb_name} not found."
    target = RUNS_DIR / f"run_{stamp()}_{nb_name}"
    papermill_options = {
        "input_path": str(source),
        "output_path": str(target),
        "cwd": str(BASE_DIR),
        "log_output": True,
        "progress_bar": False,
        "request_save_on_cell_execute": True,
        "execution_timeout": PAPERMILL_TIMEOUT,
    }
    pm.execute_notebook(**papermill_options)
    return f"Executed {nb_name}"
108
-
109
-
110
def run_datacreation() -> str:
    """Run the notebook named by ``NB1`` and list the CSV files in ``BASE_DIR``.

    Never raises: returns a log string starting with ``"OK"`` on success and
    ``"FAILED"`` otherwise (missing notebook or execution error).
    """
    try:
        log = run_notebook(NB1)
        # run_notebook signals a missing notebook with an "ERROR: ..." string
        # rather than an exception; do not report that as a success.
        if log.startswith("ERROR"):
            return f"FAILED {log}"
        csvs = sorted(f.name for f in BASE_DIR.glob("*.csv"))
        return f"OK {log}\n\nCSVs now in /app:\n" + "\n".join(f" - {c}" for c in csvs)
    except Exception as e:
        # Top-level UI boundary: show the error and the tail of the traceback.
        return f"FAILED {e}\n\n{traceback.format_exc()[-2000:]}"
117
-
118
-
119
def run_pythonanalysis() -> str:
    """Run the notebook named by ``NB2`` and report the Python artifacts found.

    Never raises: returns a log string starting with ``"OK"`` on success and
    ``"FAILED"`` otherwise (missing notebook or execution error).
    """
    try:
        log = run_notebook(NB2)
        # run_notebook signals a missing notebook with an "ERROR: ..." string
        # rather than an exception; do not report that as a success.
        if log.startswith("ERROR"):
            return f"FAILED {log}"
        idx = artifacts_index()
        figs = idx["python"]["figures"]
        tabs = idx["python"]["tables"]
        return (
            f"OK {log}\n\n"
            f"Figures: {', '.join(figs) or '(none)'}\n"
            f"Tables: {', '.join(tabs) or '(none)'}"
        )
    except Exception as e:
        # Top-level UI boundary: show the error and the tail of the traceback.
        return f"FAILED {e}\n\n{traceback.format_exc()[-2000:]}"
132
-
133
-
134
def run_r() -> str:
    """Run the notebook named by ``NB3`` and report the R artifacts found.

    ``NB3`` has to be defined at module level; if it is not, the resulting
    NameError is caught below and reported as ``"FAILED"``.

    Never raises: returns a log string starting with ``"OK"`` on success and
    ``"FAILED"`` otherwise (missing notebook or execution error).
    """
    try:
        log = run_notebook(NB3)
        # run_notebook signals a missing notebook with an "ERROR: ..." string
        # rather than an exception; do not report that as a success.
        if log.startswith("ERROR"):
            return f"FAILED {log}"
        idx = artifacts_index()
        figs = idx["r"]["figures"]
        tabs = idx["r"]["tables"]
        return (
            f"OK {log}\n\n"
            f"Figures: {', '.join(figs) or '(none)'}\n"
            f"Tables: {', '.join(tabs) or '(none)'}"
        )
    except Exception as e:
        # Top-level UI boundary: show the error and the tail of the traceback.
        return f"FAILED {e}\n\n{traceback.format_exc()[-2000:]}"
147
-
148
-
149
def run_full_pipeline() -> str:
    """Run the three pipeline steps in order and return their combined log.

    Each step's log is preceded by a three-line banner; steps are separated by
    one blank line. Later steps run even if an earlier one reports a failure.
    """
    rule = "=" * 50
    steps = (
        ("STEP 1/3: Data Creation (web scraping + synthetic data)", run_datacreation),
        ("STEP 2/3: Python Analysis (sentiment, ARIMA, dashboard)", run_pythonanalysis),
        ("STEP 3/3: R Analysis (ETS/ARIMA forecasting)", run_r),
    )
    sections = []
    for title, runner in steps:
        sections.append("\n".join((rule, title, rule, runner())))
    return "\n\n".join(sections)
166
-
167
-
168
- # =========================================================
169
- # GALLERY LOADERS
170
- # =========================================================
171
-
172
def _load_all_figures() -> List[Tuple[str, str]]:
    """Return ``(filepath, caption)`` pairs for the Gallery, Python figures first.

    Accepts the same image suffixes as ``artifacts_index`` (.png/.jpg/.jpeg,
    case-insensitive) so that every indexed figure also shows up in the
    gallery; previously only ``*.png`` files were picked up. Captions are the
    file stem with underscores replaced by spaces, title-cased.
    """
    image_exts = (".png", ".jpg", ".jpeg")
    items = []
    for label, fig_dir in (("Python", PY_FIG_DIR), ("R", R_FIG_DIR)):
        if not fig_dir.is_dir():
            continue
        for p in sorted(fig_dir.iterdir()):
            if p.is_file() and p.suffix.lower() in image_exts:
                items.append((str(p), f"{label} | {p.stem.replace('_', ' ').title()}"))
    return items
180
-
181
-
182
def _load_table_safe(path: Path) -> pd.DataFrame:
    """Load a CSV or JSON table for preview without ever raising.

    JSON objects become a single-row frame; other JSON payloads are passed to
    ``pd.DataFrame`` and capped at ``MAX_PREVIEW_ROWS`` rows, matching the cap
    applied to CSV files. Any failure is returned as a one-row frame with an
    ``"error"`` column.
    """
    try:
        # Case-insensitive, like the suffix filter used to list tables:
        # a "X.JSON" file must not be parsed as CSV.
        if path.suffix.lower() == ".json":
            obj = _read_json(path)
            if isinstance(obj, dict):
                return pd.DataFrame([obj])
            return pd.DataFrame(obj).head(MAX_PREVIEW_ROWS)
        return _read_csv(path)
    except Exception as e:
        # Deliberate best-effort: the UI shows the message instead of crashing.
        return pd.DataFrame([{"error": str(e)}])
192
-
193
-
194
def refresh_gallery():
    """Rebuild the Results Gallery when the user clicks Refresh.

    Returns a 3-tuple: the gallery items, a dropdown update carrying all
    ``"<scope>/<file>"`` table choices (first one selected), and the preview
    frame for that first table (empty frame when there are no tables).
    """
    figures = _load_all_figures()
    idx = artifacts_index()

    table_choices = [
        f"{scope}/{name}"
        for scope in ("python", "r")
        for name in idx[scope]["tables"]
    ]

    if table_choices:
        selected = table_choices[0]
        scope, name = selected.split("/", 1)
        table_dir = PY_TAB_DIR if scope == "python" else R_TAB_DIR
        default_df = _load_table_safe(table_dir / name)
    else:
        selected = None
        default_df = pd.DataFrame()

    dropdown_update = gr.update(choices=table_choices, value=selected)
    return (figures or [], dropdown_update, default_df)
217
-
218
-
219
def on_table_select(choice: str):
    """Load the table for a ``"<scope>/<file>"`` dropdown value.

    Returns a one-row hint or error frame when the value is empty, malformed,
    names an unknown scope, or points at a file that does not exist.
    """
    if not choice or "/" not in choice:
        return pd.DataFrame([{"hint": "Select a table above."}])

    scope, _, name = choice.partition("/")
    if scope == "python":
        table_dir = PY_TAB_DIR
    elif scope == "r":
        table_dir = R_TAB_DIR
    else:
        return pd.DataFrame([{"error": f"Unknown scope: {scope}"}])

    path = table_dir / name
    if path.exists():
        return _load_table_safe(path)
    return pd.DataFrame([{"error": f"File not found: {path}"}])
230
-
231
-
232
- # =========================================================
233
- # KPI LOADER
234
- # =========================================================
235
-
236
def load_kpis() -> Dict[str, Any]:
    """Return the KPI dictionary from the first usable ``kpis.json``.

    Looks in the Python tables directory, then the Python figures directory.
    A candidate that cannot be read or parsed, or whose top-level JSON value
    is not an object, is skipped (callers use ``.get`` on the result).
    Returns ``{}`` when no candidate is usable.
    """
    for candidate in (PY_TAB_DIR / "kpis.json", PY_FIG_DIR / "kpis.json"):
        if not candidate.exists():
            continue
        try:
            data = _read_json(candidate)
        except (OSError, ValueError):
            # Unreadable or malformed file (JSON and Unicode decode errors are
            # ValueError subclasses): best-effort, try the next candidate.
            continue
        if isinstance(data, dict):
            return data
    return {}
244
-
245
-
246
- # =========================================================
247
- # AI DASHBOARD (Tab 3) -- LLM picks what to display
248
- # =========================================================
249
-
250
# System prompt template for the AI Dashboard chat. ai_chat() fills the
# {artifacts_json} and {kpis_json} placeholders through str.format, which is
# why the literal braces of the example JSON directive are doubled.
DASHBOARD_SYSTEM = """You are an AI dashboard assistant for a book-sales analytics app.
The user asks questions or requests about their data. You have access to pre-computed
artifacts from Python and R analysis pipelines.

AVAILABLE ARTIFACTS (only reference ones that exist):
{artifacts_json}

KPI SUMMARY: {kpis_json}

YOUR JOB:
1. Answer the user's question conversationally using the KPIs and your knowledge of the artifacts.
2. At the END of your response, output a JSON block (fenced with ```json ... ```) that tells
the dashboard which artifact to display. The JSON must have this shape:
{{"show": "figure"|"table"|"none", "scope": "python"|"r", "filename": "..."}}

- Use "show": "figure" to display a chart image.
- Use "show": "table" to display a CSV/JSON table.
- Use "show": "none" if no artifact is relevant.

RULES:
- If the user asks about sales trends or forecasting by title, show sales_trends or arima figures.
- If the user asks about sentiment, show sentiment figure or sentiment_counts table.
- If the user asks about R regression, the R notebook focuses on forecasting, show accuracy_table.csv.
- If the user asks about forecast accuracy or model comparison, show accuracy_table.csv or forecast_compare.png.
- If the user asks about top sellers, show top_titles_by_units_sold.csv.
- If the user asks a general data question, pick the most relevant artifact.
- Keep your answer concise (2-4 sentences), then the JSON block.
"""
278
-
279
# Fenced ```json ... ``` directive, as requested by the system prompt.
JSON_BLOCK_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)
# Bare, un-nested JSON object mentioning "show"; used when the fence is missing.
FALLBACK_JSON_RE = re.compile(r"\{[^{}]*\"show\"[^{}]*\}", re.DOTALL)


def _parse_display_directive(text: str) -> Dict[str, str]:
    """Extract the display directive from a model response.

    Tries the fenced ```json block first, then a bare ``{... "show" ...}``
    object. Returns ``{"show": "none"}`` when ``text`` is empty/``None`` or no
    candidate parses as a JSON object.
    """
    text = text or ""
    # (pattern, group holding the JSON text)
    for pattern, group in ((JSON_BLOCK_RE, 1), (FALLBACK_JSON_RE, 0)):
        m = pattern.search(text)
        if not m:
            continue
        try:
            parsed = json.loads(m.group(group))
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return {"show": "none"}


def _clean_response(text: str) -> str:
    """Strip the JSON directive from the text shown to the user.

    Removes every fenced ```json block. When the response contains no fenced
    block, the bare directive object accepted by ``_parse_display_directive``
    is removed instead, so an unfenced directive no longer leaks into the
    chat reply.
    """
    cleaned, removed = JSON_BLOCK_RE.subn("", text or "")
    if not removed:
        cleaned = FALLBACK_JSON_RE.sub("", cleaned)
    return cleaned.strip()
302
-
303
-
304
def ai_chat(user_msg: str, history: list):
    """Chat handler for the AI Dashboard tab.

    Answers ``user_msg`` with the LLM when it is enabled (falling back to
    keyword matching on any LLM error) or with keyword matching otherwise,
    then resolves the returned display directive to a figure path or a table.

    Returns ``(new_history, "", figure_path_or_None, table_or_None)``; the
    empty string clears the input box. A blank message returns the history
    unchanged.
    """
    if not user_msg or not user_msg.strip():
        return history, "", None, None

    idx = artifacts_index()
    kpis = load_kpis()

    if not LLM_ENABLED:
        reply, directive = _keyword_fallback(user_msg, idx, kpis)
    else:
        system = DASHBOARD_SYSTEM.format(
            artifacts_json=json.dumps(idx, indent=2),
            kpis_json=json.dumps(kpis, indent=2) if kpis else "(no KPIs yet, run the pipeline first)",
        )
        msgs = [{"role": "system", "content": system}]
        # Only the six most recent history entries are sent as context.
        msgs.extend((history or [])[-6:])
        msgs.append({"role": "user", "content": user_msg})

        try:
            r = llm_client.chat_completion(
                model=MODEL_NAME,
                messages=msgs,
                temperature=0.3,
                max_tokens=600,
                stream=False,
            )
            # The response is read either as a plain dict or via attributes.
            raw = (
                r["choices"][0]["message"]["content"]
                if isinstance(r, dict)
                else r.choices[0].message.content
            )
            directive = _parse_display_directive(raw)
            reply = _clean_response(raw)
        except Exception as e:
            reply = f"LLM error: {e}. Falling back to keyword matching."
            reply_fb, directive = _keyword_fallback(user_msg, idx, kpis)
            reply += "\n\n" + reply_fb

    # Resolve artifact paths
    fig_out = None
    tab_out = None
    show = directive.get("show", "none")
    scope = directive.get("scope", "")
    fname = directive.get("filename", "")

    if show in ("figure", "table") and scope and fname:
        dirs = (
            {"python": PY_FIG_DIR, "r": R_FIG_DIR}
            if show == "figure"
            else {"python": PY_TAB_DIR, "r": R_TAB_DIR}
        )
        base = dirs.get(scope) if isinstance(scope, str) else None
        # The directive can originate from model output, so the file name is
        # untrusted: accept only a plain name with no directory components
        # (rejects "../x", absolute paths and non-string values).
        plain_name = (
            isinstance(fname, str)
            and fname not in (".", "..")
            and Path(fname).name == fname
        )
        target = base / fname if base is not None and plain_name else None
        if target is not None and target.is_file():
            if show == "figure":
                fig_out = str(target)
            else:
                tab_out = _load_table_safe(target)
        else:
            reply += f"\n\n*(Could not find {show}: {scope}/{fname})*"

    new_history = (history or []) + [
        {"role": "user", "content": user_msg},
        {"role": "assistant", "content": reply},
    ]

    return new_history, "", fig_out, tab_out
371
-
372
-
373
def _keyword_fallback(msg: str, idx: Dict, kpis: Dict) -> Tuple[str, Dict]:
    """Simple keyword matcher used when the LLM is unavailable.

    Args:
        msg: The user's message.
        idx: Artifact index with ``"python"`` and ``"r"`` entries, each holding
            ``"figures"`` and ``"tables"`` lists.
        kpis: KPI dictionary (may be empty).

    Returns:
        ``(reply_text, directive)`` where ``directive`` has the same shape as
        the JSON directive requested from the LLM.

    Keywords match at the start of a word, so "trend" still matches "trends"
    but "lm" no longer fires on "film", "top" on "stop", "ets" on "markets",
    or "r analysis" on "for analysis".
    """
    msg_lower = msg.lower()

    def mentions(*keywords: str) -> bool:
        # Word-start match; suffixes such as plurals are still accepted.
        return any(re.search(r"\b" + re.escape(k), msg_lower) for k in keywords)

    if not any(idx[s]["figures"] or idx[s]["tables"] for s in ("python", "r")):
        return (
            "No artifacts found yet. Please run the pipeline first (Tab 1), "
            "then come back here to explore the results.",
            {"show": "none"},
        )

    kpi_text = ""
    if kpis and isinstance(kpis, dict):
        total = kpis.get("total_units_sold", 0)
        # A non-numeric value in kpis.json must not crash the chat handler.
        is_number = isinstance(total, (int, float)) and not isinstance(total, bool)
        total_text = f"{total:,.0f}" if is_number else "?"
        kpi_text = (
            f"Quick summary: **{kpis.get('n_titles', '?')}** book titles across "
            f"**{kpis.get('n_months', '?')}** months, with **{total_text}** total units sold."
        )

    r_figures = idx.get("r", {}).get("figures", [])

    if mentions("trend", "sales trend", "monthly sale"):
        return (
            f"Here are the sales trends for sampled titles. {kpi_text}",
            {"show": "figure", "scope": "python", "filename": "sales_trends_sampled_titles.png"},
        )

    if mentions("sentiment", "review", "positive", "negative"):
        return (
            f"Here is the sentiment distribution across sampled book titles. {kpi_text}",
            {"show": "figure", "scope": "python", "filename": "sentiment_distribution_sampled_titles.png"},
        )

    if mentions("arima", "forecast", "predict"):
        # A comparison request prefers the R comparison plot when it exists.
        if mentions("compar", "ets", "accuracy") and "forecast_compare.png" in r_figures:
            return (
                "Here is the ARIMA+Fourier vs ETS forecast comparison from the R analysis.",
                {"show": "figure", "scope": "r", "filename": "forecast_compare.png"},
            )
        return (
            f"Here are the ARIMA forecasts for sampled titles from the Python analysis. {kpi_text}",
            {"show": "figure", "scope": "python", "filename": "arima_forecasts_sampled_titles.png"},
        )

    if mentions("regression", "lm", "coefficient", "price effect", "rating effect"):
        return (
            "The R notebook focuses on forecasting rather than regression. "
            "Here is the forecast accuracy comparison instead.",
            {"show": "table", "scope": "r", "filename": "accuracy_table.csv"},
        )

    if mentions("top", "best sell", "popular", "rank"):
        return (
            f"Here are the top-selling titles by units sold. {kpi_text}",
            {"show": "table", "scope": "python", "filename": "top_titles_by_units_sold.csv"},
        )

    if mentions("accuracy", "benchmark", "rmse", "mape"):
        return (
            "Here is the forecast accuracy comparison (ARIMA+Fourier vs ETS) from the R analysis.",
            {"show": "table", "scope": "r", "filename": "accuracy_table.csv"},
        )

    # Falls through to the later rules when the R comparison plot is absent.
    if mentions("r analysis", "r output", "r result") and "forecast_compare.png" in r_figures:
        return (
            "Here is the main R output: forecast model comparison plot.",
            {"show": "figure", "scope": "r", "filename": "forecast_compare.png"},
        )

    if mentions("dashboard", "overview", "summary", "kpi"):
        return (
            f"Dashboard overview: {kpi_text}\n\nAsk me about sales trends, sentiment, forecasts, "
            "forecast accuracy, or top sellers to see specific visualizations.",
            {"show": "table", "scope": "python", "filename": "df_dashboard.csv"},
        )

    # Default
    return (
        f"I can show you various analyses. {kpi_text}\n\n"
        "Try asking about: **sales trends**, **sentiment**, **ARIMA forecasts**, "
        "**forecast accuracy**, **top sellers**, or **dashboard overview**.",
        {"show": "none"},
    )
456
-
457
-
458
- # =========================================================
459
- # UI
460
- # =========================================================
461
-
462
# Make sure the output directories exist before the UI is built.
ensure_dirs()


def load_css() -> str:
    """Return the text of ``style.css`` next to this file, or ``""`` if absent."""
    stylesheet = BASE_DIR / "style.css"
    if not stylesheet.exists():
        return ""
    return stylesheet.read_text(encoding="utf-8")
467
-
468
-
469
# Gradio UI: three tabs (pipeline runner, results gallery, AI dashboard).
# Components are laid out in the order they are created, so statement order
# in this block is significant.
with gr.Blocks(title="RX12 Workshop App") as demo:

    gr.Markdown(
        "# RX12 - Intro to Python and R - Workshop App\n"
        "*The app to integrate the three notebooks in to get a functioning blueprint of the group project's final product*",
        elem_id="escp_title",
    )

    # ===========================================================
    # TAB 1 -- Pipeline Runner
    # ===========================================================
    with gr.Tab("Pipeline Runner"):
        # NOTE(review): this and the three Markdown components below are
        # created without any text -- presumably step descriptions were
        # intended here; confirm.
        gr.Markdown(
        )

        with gr.Row():
            with gr.Column(scale=1):
                btn_nb1 = gr.Button(
                    "Step 1: Data Creation",
                    variant="secondary",
                )
                gr.Markdown(
                )
            with gr.Column(scale=1):
                btn_nb2 = gr.Button(
                    "Step 2a: Python Analysis",
                    variant="secondary",
                )
                gr.Markdown(
                )
            with gr.Column(scale=1):
                btn_r = gr.Button(
                    "Step 2b: R Analysis",
                    variant="secondary",
                )
                gr.Markdown(
                )

        with gr.Row():
            btn_all = gr.Button(
                "Run All 3 Steps",
                variant="primary",
            )

        run_log = gr.Textbox(
            label="Execution Log",
            lines=18,
            max_lines=30,
            interactive=False,
        )

        # Every runner returns a single log string that replaces the log box.
        btn_nb1.click(run_datacreation, outputs=[run_log])
        btn_nb2.click(run_pythonanalysis, outputs=[run_log])
        btn_r.click(run_r, outputs=[run_log])
        btn_all.click(run_full_pipeline, outputs=[run_log])

    # ===========================================================
    # TAB 2 -- Results Gallery
    # ===========================================================
    with gr.Tab("Results Gallery"):
        gr.Markdown(
            "### All generated artifacts\n\n"
            "After running the pipeline, click **Refresh** to load all figures and tables. "
            "Figures are shown in the gallery; select a table from the dropdown to inspect it."
        )

        refresh_btn = gr.Button("Refresh Gallery", variant="primary")

        gr.Markdown("#### Figures")
        gallery = gr.Gallery(
            label="All Figures (Python + R)",
            columns=2,
            height=480,
            object_fit="contain",
        )

        gr.Markdown("#### Tables")
        # Choices start empty; refresh_gallery fills them in.
        table_dropdown = gr.Dropdown(
            label="Select a table to view",
            choices=[],
            interactive=True,
        )
        table_display = gr.Dataframe(
            label="Table Preview",
            interactive=False,
        )

        # refresh_gallery returns (gallery items, dropdown update, preview frame).
        refresh_btn.click(
            refresh_gallery,
            outputs=[gallery, table_dropdown, table_display],
        )
        table_dropdown.change(
            on_table_select,
            inputs=[table_dropdown],
            outputs=[table_display],
        )

    # ===========================================================
    # TAB 3 -- AI Dashboard
    # ===========================================================
    with gr.Tab('"AI" Dashboard'):
        # The status line reflects LLM_ENABLED as evaluated at import time.
        gr.Markdown(
            "### Ask questions, get visualisations\n\n"
            "Describe what you want to see and the AI will pick the right chart or table. "
            + (
                "*LLM is active.*"
                if LLM_ENABLED
                else "*No API key detected \u2014 using keyword matching. "
                "Set `HF_API_KEY` in Space secrets for full LLM support.*"
            )
        )

        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                chatbot = gr.Chatbot(
                    label="Conversation",
                    height=380,
                )
                user_input = gr.Textbox(
                    label="Ask about your data",
                    placeholder="e.g. Show me sales trends / What drives revenue? / Compare forecast models",
                    lines=1,
                )
                gr.Examples(
                    examples=[
                        "Show me the sales trends",
                        "What does the sentiment look like?",
                        "Which titles sell the most?",
                        "Show the forecast accuracy comparison",
                        "Compare the ARIMA and ETS forecasts",
                        "Give me a dashboard overview",
                    ],
                    inputs=user_input,
                )

            with gr.Column(scale=1):
                ai_figure = gr.Image(
                    label="Visualisation",
                    height=350,
                )
                ai_table = gr.Dataframe(
                    label="Data Table",
                    interactive=False,
                )

        # ai_chat returns (history, cleared input text, figure path, table).
        user_input.submit(
            ai_chat,
            inputs=[user_input, chatbot],
            outputs=[chatbot, user_input, ai_figure, ai_table],
        )


# Start the app at import time; BASE_DIR is passed as an allowed path.
demo.launch(css=load_css(), allowed_paths=[str(BASE_DIR)])