XRachel committed on
Commit
6f46588
·
verified ·
1 Parent(s): 8ce3168

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -582
app.py DELETED
@@ -1,582 +0,0 @@
1
- import os
2
- import re
3
- import json
4
- import time
5
- import traceback
6
- from pathlib import Path
7
- from typing import Dict, Any, List, Optional, Tuple
8
-
9
- import pandas as pd
10
- import gradio as gr
11
- import papermill as pm
12
-
13
- # Optional LLM (HuggingFace Inference API)
14
- try:
15
- from huggingface_hub import InferenceClient
16
- except Exception:
17
- InferenceClient = None
18
-
19
- # =========================================================
20
- # CONFIG
21
- # =========================================================
22
-
23
# Directory that contains this file; every other path below is built from it.
BASE_DIR = Path(__file__).resolve().parent

# Notebook file names for the three pipeline steps. Each can be overridden
# through the environment variable of the same name.
NB1 = os.environ.get("NB1", "datacreation.ipynb").strip()
NB2 = os.environ.get("NB2", "pythonanalysis.ipynb").strip()
NB3 = os.environ.get("NB3", "ranalysis.ipynb").strip()

# runs/ receives the executed notebook copies; artifacts/<py|r>/<figures|tables>
# are the directories the app lists and displays.
RUNS_DIR = BASE_DIR / "runs"
ART_DIR = BASE_DIR / "artifacts"
PY_FIG_DIR = ART_DIR / "py" / "figures"
PY_TAB_DIR = ART_DIR / "py" / "tables"
R_FIG_DIR = ART_DIR / "r" / "figures"
R_TAB_DIR = ART_DIR / "r" / "tables"

# Value passed to papermill as ``execution_timeout``.
PAPERMILL_TIMEOUT = int(os.environ.get("PAPERMILL_TIMEOUT", "1800"))
# Row cap for CSV previews. Note the environment variable is named
# MAX_FILE_PREVIEW_ROWS, not MAX_PREVIEW_ROWS.
MAX_PREVIEW_ROWS = int(os.environ.get("MAX_FILE_PREVIEW_ROWS", "50"))
# Default number of trailing characters kept by tail().
MAX_LOG_CHARS = int(os.environ.get("MAX_LOG_CHARS", "8000"))

# Hugging Face inference settings; an empty HF_API_KEY disables the LLM path.
HF_API_KEY = os.environ.get("HF_API_KEY", "").strip()
MODEL_NAME = os.environ.get("MODEL_NAME", "deepseek-ai/DeepSeek-R1").strip()
HF_PROVIDER = os.environ.get("HF_PROVIDER", "novita").strip()

# The LLM is used only when a key is configured AND huggingface_hub imported
# successfully (InferenceClient is set to None when that import fails).
LLM_ENABLED = bool(HF_API_KEY) and InferenceClient is not None
llm_client = (
    InferenceClient(provider=HF_PROVIDER, api_key=HF_API_KEY)
    if LLM_ENABLED
    else None
)
50
-
51
- # =========================================================
52
- # HELPERS
53
- # =========================================================
54
-
55
def ensure_dirs():
    """Create the runs directory and every artifact directory if missing."""
    required = (RUNS_DIR, ART_DIR, PY_FIG_DIR, PY_TAB_DIR, R_FIG_DIR, R_TAB_DIR)
    for directory in required:
        # parents/exist_ok make the call safe to repeat on every run.
        directory.mkdir(parents=True, exist_ok=True)
58
-
59
def stamp():
    """Return the current local time formatted as ``YYYYMMDD-HHMMSS``."""
    now = time.localtime()
    return time.strftime("%Y%m%d-%H%M%S", now)
61
-
62
def tail(text: str, n: int = MAX_LOG_CHARS) -> str:
    """Return the last *n* characters of *text*.

    Args:
        text: Source string; ``None`` or empty is treated as ``""``.
        n: Maximum number of trailing characters to keep.

    Returns:
        The trailing slice, or ``""`` when *n* is zero or negative.
    """
    # ``s[-0:]`` is ``s[0:]`` (the whole string) and a negative n would drop
    # the head instead of taking a tail, so handle non-positive n explicitly.
    if n <= 0:
        return ""
    return (text or "")[-n:]
64
-
65
def _ls(dir_path: Path, exts: Tuple[str, ...]) -> List[str]:
    """List file names directly inside *dir_path* whose suffix is in *exts*.

    The suffix is lower-cased before the membership test, so *exts* should
    hold lower-case suffixes including the dot. Returns a sorted list; a
    missing or non-directory *dir_path* yields an empty list.
    """
    names: List[str] = []
    if dir_path.is_dir():
        for entry in dir_path.iterdir():
            if entry.is_file() and entry.suffix.lower() in exts:
                names.append(entry.name)
        names.sort()
    return names
69
-
70
def _read_csv(path: Path) -> pd.DataFrame:
    """Load at most ``MAX_PREVIEW_ROWS`` rows of the CSV file at *path*."""
    preview = pd.read_csv(path, nrows=MAX_PREVIEW_ROWS)
    return preview
72
-
73
def _read_json(path: Path):
    """Decode the UTF-8 JSON file at *path* and return the parsed object."""
    raw = path.read_text(encoding="utf-8")
    return json.loads(raw)
76
-
77
def artifacts_index() -> Dict[str, Any]:
    """Return the figure and table file names available per analysis scope.

    The result maps ``"python"`` and ``"r"`` to a dict with ``"figures"``
    (png/jpg/jpeg names) and ``"tables"`` (csv/json names).
    """
    image_exts = (".png", ".jpg", ".jpeg")
    table_exts = (".csv", ".json")
    layout = (
        ("python", PY_FIG_DIR, PY_TAB_DIR),
        ("r", R_FIG_DIR, R_TAB_DIR),
    )
    return {
        scope: {
            "figures": _ls(fig_dir, image_exts),
            "tables": _ls(tab_dir, table_exts),
        }
        for scope, fig_dir, tab_dir in layout
    }
88
-
89
- # =========================================================
90
- # PIPELINE RUNNERS
91
- # =========================================================
92
-
93
def run_notebook(nb_name: str) -> str:
    """Execute one notebook with papermill and save the executed copy.

    Args:
        nb_name: Notebook path relative to ``BASE_DIR``.

    Returns:
        ``"Executed <nb_name>"`` on success, or ``"ERROR: <nb_name> not
        found."`` when the input notebook does not exist.

    Raises:
        Whatever ``papermill.execute_notebook`` raises; callers are expected
        to catch it.
    """
    ensure_dirs()
    nb_in = BASE_DIR / nb_name
    if not nb_in.exists():
        return f"ERROR: {nb_name} not found."
    # Use only the final path component for the output file name: a notebook
    # configured with a sub-path (e.g. "nbs/x.ipynb") would otherwise point
    # at a directory under runs/ that was never created.
    nb_out = RUNS_DIR / f"run_{stamp()}_{Path(nb_name).name}"
    pm.execute_notebook(
        input_path=str(nb_in),
        output_path=str(nb_out),
        cwd=str(BASE_DIR),
        log_output=True,
        progress_bar=False,
        request_save_on_cell_execute=True,
        execution_timeout=PAPERMILL_TIMEOUT,
    )
    return f"Executed {nb_name}"
109
-
110
def run_datacreation() -> str:
    """Run the data-creation notebook and report the CSV files in ``BASE_DIR``.

    Returns:
        A log string starting with ``"OK "`` on success or ``"FAILED "`` on
        any error. This function never raises.
    """
    try:
        log = run_notebook(NB1)
        # run_notebook signals a missing notebook with an "ERROR..." string
        # rather than an exception; do not report that as success.
        if log.startswith("ERROR"):
            return f"FAILED {log}"
        csvs = sorted(f.name for f in BASE_DIR.glob("*.csv"))
        return f"OK {log}\n\nCSVs now in /app:\n" + "\n".join(f" - {c}" for c in csvs)
    except Exception as e:
        # Keep only the end of the traceback so the UI log stays readable.
        return f"FAILED {e}\n\n{traceback.format_exc()[-2000:]}"
117
-
118
def run_pythonanalysis() -> str:
    """Run the Python analysis notebook and list the Python artifacts.

    Returns:
        A log string starting with ``"OK "`` on success or ``"FAILED "`` on
        any error. This function never raises.
    """
    try:
        log = run_notebook(NB2)
        # run_notebook signals a missing notebook with an "ERROR..." string
        # rather than an exception; do not report that as success.
        if log.startswith("ERROR"):
            return f"FAILED {log}"
        idx = artifacts_index()
        figs = idx["python"]["figures"]
        tabs = idx["python"]["tables"]
        return (
            f"OK {log}\n\n"
            f"Figures: {', '.join(figs) or '(none)'}\n"
            f"Tables: {', '.join(tabs) or '(none)'}"
        )
    except Exception as e:
        # Keep only the end of the traceback so the UI log stays readable.
        return f"FAILED {e}\n\n{traceback.format_exc()[-2000:]}"
131
-
132
def run_r() -> str:
    """Run the R analysis notebook and list the R artifacts.

    Returns:
        A log string starting with ``"OK "`` on success or ``"FAILED "`` on
        any error. This function never raises.
    """
    try:
        log = run_notebook(NB3)
        # run_notebook signals a missing notebook with an "ERROR..." string
        # rather than an exception; do not report that as success.
        if log.startswith("ERROR"):
            return f"FAILED {log}"
        idx = artifacts_index()
        figs = idx["r"]["figures"]
        tabs = idx["r"]["tables"]
        return (
            f"OK {log}\n\n"
            f"Figures: {', '.join(figs) or '(none)'}\n"
            f"Tables: {', '.join(tabs) or '(none)'}"
        )
    except Exception as e:
        # Keep only the end of the traceback so the UI log stays readable.
        return f"FAILED {e}\n\n{traceback.format_exc()[-2000:]}"
145
-
146
def run_full_pipeline() -> str:
    """Run the three pipeline steps in order and return their combined log.

    Each step is rendered as a banner (rule, title, rule) followed by that
    step's log; steps are separated by one blank line. All three steps are
    run regardless of what the earlier ones returned.
    """
    rule = "=" * 50
    steps = (
        ("STEP 1/3: Data Creation (web scraping + synthetic data)", run_datacreation),
        ("STEP 2/3: Python Analysis (sentiment, ARIMA, dashboard)", run_pythonanalysis),
        ("STEP 3/3: R Analysis (ETS/ARIMA forecasting)", run_r),
    )
    sections = []
    for title, runner in steps:
        sections.append("\n".join((rule, title, rule, runner())))
    return "\n\n".join(sections)
163
-
164
- # =========================================================
165
- # GALLERY LOADERS
166
- # =========================================================
167
-
168
def _load_all_figures() -> List[Tuple[str, str]]:
    """Return ``(filepath, caption)`` pairs for the gallery.

    Python figures come first, then R figures, each group sorted by path.
    The caption is ``"<Python|R> | <Title Cased Stem>"``.
    """
    # Same extensions (matched case-insensitively) as artifacts_index(), so a
    # figure that is listed there is also shown in the gallery. The original
    # "*.png" glob silently skipped JPEG figures.
    image_exts = (".png", ".jpg", ".jpeg")
    items: List[Tuple[str, str]] = []
    for label, fig_dir in (("Python", PY_FIG_DIR), ("R", R_FIG_DIR)):
        if not fig_dir.is_dir():
            continue
        images = sorted(
            p
            for p in fig_dir.iterdir()
            if p.is_file() and p.suffix.lower() in image_exts
        )
        for p in images:
            items.append((str(p), f"{label} | {p.stem.replace('_', ' ').title()}"))
    return items
176
-
177
def _load_table_safe(path: Path) -> pd.DataFrame:
    """Load a CSV or JSON table as a DataFrame without ever raising.

    A JSON object becomes a one-row frame; any other JSON value is handed to
    ``pd.DataFrame`` directly. Every other suffix is read as CSV. On failure
    a one-row frame with an ``"error"`` column is returned instead.
    """
    try:
        # Compare case-insensitively: the artifact listing lower-cases
        # suffixes, so "X.JSON" is offered as a table and must not be parsed
        # as CSV.
        if path.suffix.lower() == ".json":
            obj = _read_json(path)
            if isinstance(obj, dict):
                return pd.DataFrame([obj])
            return pd.DataFrame(obj)
        return _read_csv(path)
    except Exception as e:
        return pd.DataFrame([{"error": str(e)}])
187
-
188
def refresh_gallery():
    """Rebuild the gallery outputs when the user clicks Refresh.

    Returns a 3-tuple: the figure list, a dropdown update holding every
    ``"<scope>/<name>"`` table choice with the first one selected, and a
    preview DataFrame of that first table (empty when there are no tables).
    """
    gallery_items = _load_all_figures()
    index = artifacts_index()

    choices = [
        f"{scope}/{name}"
        for scope in ("python", "r")
        for name in index[scope]["tables"]
    ]
    first = choices[0] if choices else None

    if first is None:
        preview = pd.DataFrame()
    else:
        scope, name = first.split("/", 1)
        folder = PY_TAB_DIR if scope == "python" else R_TAB_DIR
        preview = _load_table_safe(folder / name)

    return (
        gallery_items if gallery_items else [],
        gr.update(choices=choices, value=first),
        preview,
    )
209
-
210
def on_table_select(choice: str):
    """Load the table named by a ``"<scope>/<name>"`` dropdown value.

    Returns the table preview, or a one-row DataFrame carrying a ``hint`` or
    ``error`` message when the choice is empty, malformed, of unknown scope,
    or points at a missing file.
    """
    if not choice or "/" not in choice:
        return pd.DataFrame([{"hint": "Select a table above."}])

    scope, _, name = choice.partition("/")
    folders = {"python": PY_TAB_DIR, "r": R_TAB_DIR}
    if scope not in folders:
        return pd.DataFrame([{"error": f"Unknown scope: {scope}"}])

    table_path = folders[scope] / name
    if table_path.exists():
        return _load_table_safe(table_path)
    return pd.DataFrame([{"error": f"File not found: {table_path}"}])
221
-
222
- # =========================================================
223
- # KPI LOADER
224
- # =========================================================
225
-
226
def load_kpis() -> Dict[str, Any]:
    """Return the KPI dictionary from the first usable ``kpis.json``.

    The Python tables directory is tried first, then the Python figures
    directory. A file that is missing, unreadable, or does not contain a
    JSON object is skipped.

    Returns:
        The parsed KPI dict, or ``{}`` when no candidate is usable.
    """
    for candidate in (PY_TAB_DIR / "kpis.json", PY_FIG_DIR / "kpis.json"):
        if not candidate.exists():
            continue
        try:
            data = _read_json(candidate)
        except Exception:
            # Deliberate best-effort: a broken file must not take the UI
            # down, so fall through to the next candidate.
            continue
        # Callers use dict methods (``.get``) on the result, so a JSON list
        # or scalar is treated as unusable instead of being returned as-is.
        if isinstance(data, dict):
            return data
    return {}
234
-
235
- # =========================================================
236
- # AI DASHBOARD (Tab 3) -- LLM picks what to display
237
- # =========================================================
238
-
239
# System prompt template for the LLM. It is filled with str.format(), so
# {artifacts_json} and {kpis_json} are placeholders, and the doubled braces
# around the JSON shape render as literal "{" / "}".
DASHBOARD_SYSTEM = """You are an AI dashboard assistant for a book-sales analytics app.
The user asks questions or requests about their data. You have access to pre-computed
artifacts from Python and R analysis pipelines.

AVAILABLE ARTIFACTS (only reference ones that exist):
{artifacts_json}

KPI SUMMARY: {kpis_json}

YOUR JOB:
1. Answer the user's question conversationally using the KPIs and your knowledge of the artifacts.
2. At the END of your response, output a JSON block (fenced with ```json ... ```) that tells
the dashboard which artifact to display. The JSON must have this shape:
{{"show": "figure"|"table"|"none", "scope": "python"|"r", "filename": "..."}}

- Use "show": "figure" to display a chart image.
- Use "show": "table" to display a CSV/JSON table.
- Use "show": "none" if no artifact is relevant.

RULES:
- If the user asks about sales trends or forecasting by title, show sales_trends or arima figures.
- If the user asks about sentiment, show sentiment figure or sentiment_counts table.
- If the user asks about R regression, the R notebook focuses on forecasting, show accuracy_table.csv.
- If the user asks about forecast accuracy or model comparison, show accuracy_table.csv or forecast_compare.png.
- If the user asks about top sellers, show top_titles_by_units_sold.csv.
- If the user asks a general data question, pick the most relevant artifact.
- Keep your answer concise (2-4 sentences), then the JSON block.
"""

# Matches a ```json ... ``` fenced block and captures the object inside it.
JSON_BLOCK_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)
# Fallback for an unfenced directive: a brace-delimited object with no nested
# braces that contains a "show" key.
FALLBACK_JSON_RE = re.compile(r"\{[^{}]*\"show\"[^{}]*\}", re.DOTALL)
270
-
271
def _parse_display_directive(text: str) -> Dict[str, str]:
    """Extract the display directive the model appended to its answer.

    The fenced ```json block is tried first, then a bare object containing
    a "show" key. Returns ``{"show": "none"}`` when neither yields valid
    JSON.
    """
    # (pattern, group holding the JSON text), in order of preference.
    attempts = ((JSON_BLOCK_RE, 1), (FALLBACK_JSON_RE, 0))
    for pattern, group in attempts:
        found = pattern.search(text)
        if found is None:
            continue
        try:
            return json.loads(found.group(group))
        except json.JSONDecodeError:
            continue
    return {"show": "none"}
285
-
286
def _clean_response(text: str) -> str:
    """Strip the JSON directive from the text shown to the user.

    All fenced ```json blocks are removed. When the response has none, the
    first unfenced ``{..."show"...}`` object is removed instead, because the
    directive parser accepts that form too and it would otherwise be shown
    as raw JSON in the chat.

    Returns:
        The remaining text with surrounding whitespace trimmed.
    """
    cleaned, removed = JSON_BLOCK_RE.subn("", text)
    if not removed:
        cleaned = FALLBACK_JSON_RE.sub("", cleaned, count=1)
    return cleaned.strip()
289
-
290
def ai_chat(user_msg: str, history: list):
    """Answer a chat message and pick an artifact to display next to it.

    Args:
        user_msg: Text typed by the user.
        history: Prior conversation as a list of ``{"role", "content"}``
            dicts (may be ``None`` or empty).

    Returns:
        A 4-tuple ``(new_history, "", figure_path_or_None,
        table_dataframe_or_None)``. The empty string clears the input box.
        A blank *user_msg* returns the history unchanged.
    """
    if not user_msg or not user_msg.strip():
        return history, "", None, None

    idx = artifacts_index()
    kpis = load_kpis()

    if not LLM_ENABLED:
        reply, directive = _keyword_fallback(user_msg, idx, kpis)
    else:
        system = DASHBOARD_SYSTEM.format(
            artifacts_json=json.dumps(idx, indent=2),
            kpis_json=json.dumps(kpis, indent=2) if kpis else "(no KPIs yet, run the pipeline first)",
        )
        msgs = [{"role": "system", "content": system}]
        # Forward only the last six entries, reduced to role/content: UI
        # history entries may carry extra keys that a chat-completion
        # endpoint has no use for, and malformed entries are skipped.
        for entry in (history or [])[-6:]:
            if isinstance(entry, dict) and "role" in entry and "content" in entry:
                msgs.append({"role": entry["role"], "content": entry["content"]})
        msgs.append({"role": "user", "content": user_msg})

        try:
            r = llm_client.chat_completion(
                model=MODEL_NAME,
                messages=msgs,
                temperature=0.3,
                max_tokens=600,
                stream=False,
            )
            # The client may return a dict or an object with attributes.
            raw = (
                r["choices"][0]["message"]["content"]
                if isinstance(r, dict)
                else r.choices[0].message.content
            )
            directive = _parse_display_directive(raw)
            reply = _clean_response(raw)
        except Exception as e:
            reply = f"LLM error: {e}. Falling back to keyword matching."
            reply_fb, directive = _keyword_fallback(user_msg, idx, kpis)
            reply += "\n\n" + reply_fb

    fig_out = None
    tab_out = None
    show = directive.get("show", "none")
    # Directive values come from model output: coerce them to strings so a
    # null or numeric value cannot break the path join below.
    scope = str(directive.get("scope") or "")
    fname = str(directive.get("filename") or "")
    # Accept only a bare file name. Anything with a directory component
    # ("../x", "/etc/passwd") would escape the artifact directories.
    is_bare_name = bool(fname) and Path(fname).name == fname

    if show == "figure" and scope and fname:
        base = {"python": PY_FIG_DIR, "r": R_FIG_DIR}.get(scope)
        # is_file() also rejects "..", which is its own ``name``.
        if base and is_bare_name and (base / fname).is_file():
            fig_out = str(base / fname)
        else:
            reply += f"\n\n*(Could not find figure: {scope}/{fname})*"

    if show == "table" and scope and fname:
        base = {"python": PY_TAB_DIR, "r": R_TAB_DIR}.get(scope)
        if base and is_bare_name and (base / fname).is_file():
            tab_out = _load_table_safe(base / fname)
        else:
            reply += f"\n\n*(Could not find table: {scope}/{fname})*"

    new_history = (history or []) + [
        {"role": "user", "content": user_msg},
        {"role": "assistant", "content": reply},
    ]

    return new_history, "", fig_out, tab_out
356
-
357
def _keyword_fallback(msg: str, idx: Dict, kpis: Dict) -> Tuple[str, Dict]:
    """Simple keyword matcher used when the LLM is unavailable.

    Args:
        msg: The user's message.
        idx: Artifact index with ``idx[scope]["figures"|"tables"]`` lists for
            the scopes ``"python"`` and ``"r"``.
        kpis: KPI dictionary (may be empty).

    Returns:
        ``(reply_text, directive)`` where the directive has the shape
        ``{"show": ..., "scope": ..., "filename": ...}`` or
        ``{"show": "none"}``.
    """
    msg_lower = msg.lower()

    def mentions(*keywords: str) -> bool:
        # Match a keyword only where a word starts. Prefixes still work
        # ("trend" hits "trends", "compar" hits "comparison"), but mid-word
        # hits such as "lm" in "film", "top" in "stop" or "r analysis" in
        # "for analysis" no longer trigger a branch.
        return any(re.search(r"\b" + re.escape(k), msg_lower) for k in keywords)

    if not any(idx[s]["figures"] or idx[s]["tables"] for s in ("python", "r")):
        return (
            "No artifacts found yet. Please run the pipeline first (Tab 1), "
            "then come back here to explore the results.",
            {"show": "none"},
        )

    kpi_text = ""
    if kpis:
        # KPIs come from a JSON file, so the total may be missing, null or a
        # string; fall back to "?" instead of crashing on the number format.
        try:
            total_text = f"{float(kpis.get('total_units_sold', 0)):,.0f}"
        except (TypeError, ValueError):
            total_text = "?"
        kpi_text = (
            f"Quick summary: **{kpis.get('n_titles', '?')}** book titles across "
            f"**{kpis.get('n_months', '?')}** months, with **{total_text}** total units sold."
        )

    has_forecast_compare = "forecast_compare.png" in idx.get("r", {}).get("figures", [])

    if mentions("trend", "sales trend", "monthly sale"):
        return (
            f"Here are the sales trends for sampled titles. {kpi_text}",
            {"show": "figure", "scope": "python", "filename": "sales_trends_sampled_titles.png"},
        )

    if mentions("sentiment", "review", "positive", "negative"):
        return (
            f"Here is the sentiment distribution across sampled book titles. {kpi_text}",
            {"show": "figure", "scope": "python", "filename": "sentiment_distribution_sampled_titles.png"},
        )

    if mentions("arima", "forecast", "predict"):
        # Prefer the R comparison plot when the user asks to compare models
        # and that plot exists; otherwise show the Python ARIMA figure.
        if mentions("compar", "ets", "accuracy") and has_forecast_compare:
            return (
                "Here is the ARIMA+Fourier vs ETS forecast comparison from the R analysis.",
                {"show": "figure", "scope": "r", "filename": "forecast_compare.png"},
            )
        return (
            f"Here are the ARIMA forecasts for sampled titles from the Python analysis. {kpi_text}",
            {"show": "figure", "scope": "python", "filename": "arima_forecasts_sampled_titles.png"},
        )

    if mentions("regression", "lm", "coefficient", "price effect", "rating effect"):
        return (
            "The R notebook focuses on forecasting rather than regression. "
            "Here is the forecast accuracy comparison instead.",
            {"show": "table", "scope": "r", "filename": "accuracy_table.csv"},
        )

    # "sell" covers phrasings such as "Which titles sell the most?", which is
    # one of the example prompts offered in the UI.
    if mentions("top", "best sell", "popular", "rank", "sell"):
        return (
            f"Here are the top-selling titles by units sold. {kpi_text}",
            {"show": "table", "scope": "python", "filename": "top_titles_by_units_sold.csv"},
        )

    if mentions("accuracy", "benchmark", "rmse", "mape"):
        return (
            "Here is the forecast accuracy comparison (ARIMA+Fourier vs ETS) from the R analysis.",
            {"show": "table", "scope": "r", "filename": "accuracy_table.csv"},
        )

    # Without the comparison plot this falls through to the checks below.
    if mentions("r analysis", "r output", "r result") and has_forecast_compare:
        return (
            "Here is the main R output: forecast model comparison plot.",
            {"show": "figure", "scope": "r", "filename": "forecast_compare.png"},
        )

    if mentions("dashboard", "overview", "summary", "kpi"):
        return (
            f"Dashboard overview: {kpi_text}\n\nAsk me about sales trends, sentiment, forecasts, "
            "forecast accuracy, or top sellers to see specific visualizations.",
            {"show": "table", "scope": "python", "filename": "df_dashboard.csv"},
        )

    return (
        f"I can show you various analyses. {kpi_text}\n\n"
        "Try asking about: **sales trends**, **sentiment**, **ARIMA forecasts**, "
        "**forecast accuracy**, **top sellers**, or **dashboard overview**.",
        {"show": "none"},
    )
439
-
440
- # =========================================================
441
- # CSS LOADER (robust injection via <style> tag)
442
- # =========================================================
443
-
444
def load_css() -> str:
    """Return the text of ``style.css`` beside this file, or ``""`` if absent."""
    stylesheet = BASE_DIR / "style.css"
    return stylesheet.read_text(encoding="utf-8") if stylesheet.exists() else ""
449
-
450
- # =========================================================
451
- # UI
452
- # =========================================================
453
-
454
# Build the three-tab Gradio interface and start the server.
# NOTE(review): the nesting below was reconstructed from a flattened listing;
# confirm the layout against the original file.

# Create runs/ and artifacts/ up front so the listing helpers find them.
ensure_dirs()

css_text = load_css()
with gr.Blocks(title="RX12 Workshop App", css=css_text) as demo:
    gr.Markdown(
        "# RX12 - Intro to Python and R - Workshop App\n"
        "*The app to integrate the three notebooks in to get a functioning blueprint of the group project's final product*",
        elem_id="escp_title",
    )

    # ===========================================================
    # TAB 1 -- Pipeline Runner
    # ===========================================================
    with gr.Tab("Pipeline Runner"):
        gr.Markdown("")

        # One button per pipeline step, side by side.
        with gr.Row():
            with gr.Column(scale=1):
                btn_nb1 = gr.Button("Step 1: Data Creation", variant="secondary")
                gr.Markdown("")
            with gr.Column(scale=1):
                btn_nb2 = gr.Button("Step 2a: Python Analysis", variant="secondary")
                gr.Markdown("")
            with gr.Column(scale=1):
                btn_r = gr.Button("Step 2b: R Analysis", variant="secondary")
                gr.Markdown("")

        with gr.Row():
            btn_all = gr.Button("Run All 3 Steps", variant="primary")

        run_log = gr.Textbox(
            label="Execution Log",
            lines=18,
            max_lines=30,
            interactive=False,
        )

        # Every runner returns one log string, shown in the shared textbox.
        btn_nb1.click(run_datacreation, outputs=[run_log])
        btn_nb2.click(run_pythonanalysis, outputs=[run_log])
        btn_r.click(run_r, outputs=[run_log])
        btn_all.click(run_full_pipeline, outputs=[run_log])

    # ===========================================================
    # TAB 2 -- Results Gallery
    # ===========================================================
    with gr.Tab("Results Gallery"):
        gr.Markdown(
            "### All generated artifacts\n\n"
            "After running the pipeline, click **Refresh** to load all figures and tables. "
            "Figures are shown in the gallery; select a table from the dropdown to inspect it."
        )

        refresh_btn = gr.Button("Refresh Gallery", variant="primary")

        gr.Markdown("#### Figures")
        gallery = gr.Gallery(
            label="All Figures (Python + R)",
            columns=2,
            height=480,
            object_fit="contain",
        )

        gr.Markdown("#### Tables")
        table_dropdown = gr.Dropdown(
            label="Select a table to view",
            choices=[],
            interactive=True,
        )
        table_display = gr.Dataframe(
            label="Table Preview",
            interactive=False,
        )

        # refresh_gallery returns (figures, dropdown update, first table preview).
        refresh_btn.click(
            refresh_gallery,
            outputs=[gallery, table_dropdown, table_display],
        )
        table_dropdown.change(
            on_table_select,
            inputs=[table_dropdown],
            outputs=[table_display],
        )

    # ===========================================================
    # TAB 3 -- AI Dashboard
    # ===========================================================
    with gr.Tab('"AI" Dashboard'):
        # The intro text states whether the LLM or the keyword matcher is in use.
        gr.Markdown(
            "### Ask questions, get visualisations\n\n"
            "Describe what you want to see and the AI will pick the right chart or table. "
            + (
                "*LLM is active.*"
                if LLM_ENABLED
                else "*No API key detected — using keyword matching. "
                "Set `HF_API_KEY` in Space secrets for full LLM support.*"
            )
        )

        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                # NOTE(review): ai_chat returns history as role/content dicts;
                # depending on the installed Gradio version the Chatbot may
                # need type="messages" to accept that format -- confirm.
                chatbot = gr.Chatbot(label="Conversation", height=380)
                user_input = gr.Textbox(
                    label="Ask about your data",
                    placeholder="e.g. Show me sales trends / What drives revenue? / Compare forecast models",
                    lines=1,
                )
                gr.Examples(
                    examples=[
                        "Show me the sales trends",
                        "What does the sentiment look like?",
                        "Which titles sell the most?",
                        "Show the forecast accuracy comparison",
                        "Compare the ARIMA and ETS forecasts",
                        "Give me a dashboard overview",
                    ],
                    inputs=user_input,
                )

            with gr.Column(scale=1):
                ai_figure = gr.Image(label="Visualisation", height=350)
                ai_table = gr.Dataframe(label="Data Table", interactive=False)

        # ai_chat returns (history, cleared input, figure path, table frame).
        user_input.submit(
            ai_chat,
            inputs=[user_input, chatbot],
            outputs=[chatbot, user_input, ai_figure, ai_table],
        )

# Listen on all interfaces on port 7860; allowed_paths is set to BASE_DIR.
demo.launch(server_name='0.0.0.0', server_port=7860, allowed_paths=[str(BASE_DIR)])