Upload 4 files
Browse files- agent.py +1134 -0
- app.py +1016 -0
- requirements.txt +13 -0
- tools.py +858 -0
agent.py
ADDED
|
@@ -0,0 +1,1134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
agent.py — LangGraph BERTopic Thematic Analysis Agent
|
| 3 |
+
======================================================
|
| 4 |
+
A strictly phase-gated ReAct agent orchestrating Braun & Clarke's (2006)
|
| 5 |
+
six-phase thematic analysis pipeline via LangGraph.
|
| 6 |
+
|
| 7 |
+
Architecture
|
| 8 |
+
------------
|
| 9 |
+
- LLM : ChatMistralAI (mistral-small-latest, free tier)
|
| 10 |
+
- Agent type : create_react_agent (LangGraph)
|
| 11 |
+
- Memory : MemorySaver (in-process checkpointing)
|
| 12 |
+
- Tools : 7 tools imported from tools.py
|
| 13 |
+
- State : agent_state dict flows through app.py <-> agent.invoke()
|
| 14 |
+
|
| 15 |
+
Phase gating
|
| 16 |
+
------------
|
| 17 |
+
Phase 0 -> awaiting file upload
|
| 18 |
+
Phase 1 -> Familiarisation [load_scopus_csv]
|
| 19 |
+
Phase 2 -> Initial Codes [run_bertopic_discovery, label_topics_with_llm]
|
| 20 |
+
STOP GATE 1 — await review table submission
|
| 21 |
+
Phase 3 -> Searching Themes [consolidate_into_themes]
|
| 22 |
+
STOP GATE 2 — await theme-merge confirmation
|
| 23 |
+
Phase 4 -> Reviewing Themes [saturation check via LLM]
|
| 24 |
+
STOP GATE 3 — await researcher sign-off
|
| 25 |
+
Phase 5 -> Defining & Naming [final naming confirmation]
|
| 26 |
+
Phase 5.5-> PAJAIS Mapping [compare_with_taxonomy]
|
| 27 |
+
STOP GATE 4 — await taxonomy review
|
| 28 |
+
Phase 6 -> Report [generate_comparison_csv, export_narrative]
|
| 29 |
+
|
| 30 |
+
Fixes applied (v2)
|
| 31 |
+
------------------
|
| 32 |
+
- BUG 2 : Removed dead lambda block (lines 514-520 in v1) that ran
|
| 33 |
+
_preprocess_phase3() twice, wasting an LLM call on every Phase 3
|
| 34 |
+
trigger. The correct ternary expression is now the only path.
|
| 35 |
+
- ISSUE 3 : After Phase 2 labels are generated, _populate_review_df() converts
|
| 36 |
+
labels.json into properly formatted review table rows and stores
|
| 37 |
+
them in agent_state["review_df"] so app.py can render the table.
|
| 38 |
+
- ISSUE 4 : Added startup warning when MISTRAL_API_KEY is missing.
|
| 39 |
+
|
| 40 |
+
Integration contract (app.py)
|
| 41 |
+
------------------------------
|
| 42 |
+
from agent import agent
|
| 43 |
+
|
| 44 |
+
reply, new_state = agent.invoke(user_message, agent_state)
|
| 45 |
+
|
| 46 |
+
agent_state keys consumed / produced:
|
| 47 |
+
phase int current phase index (0-6; NOTE: PHASE_LABELS defines keys 0-8 — TODO confirm which range is authoritative)
|
| 48 |
+
file_path str path to uploaded CSV
|
| 49 |
+
run_key str "abstract" | "title"
|
| 50 |
+
review_df list[dict] review table rows (populated after Phase 2)
|
| 51 |
+
theme_map dict {theme_name: [cluster_id, ...]}
|
| 52 |
+
charts dict {chart_name: html_path}
|
| 53 |
+
output_files list[str] paths to downloadable artefacts
|
| 54 |
+
thread_id str LangGraph memory thread identifier
|
| 55 |
+
stop_gate str|None active gate name or None
|
| 56 |
+
"""
|
| 57 |
+
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
# Stdlib
|
| 60 |
+
# ---------------------------------------------------------------------------
|
| 61 |
+
import os
|
| 62 |
+
import json
|
| 63 |
+
import uuid
|
| 64 |
+
import time
|
| 65 |
+
|
| 66 |
+
# ---------------------------------------------------------------------------
|
| 67 |
+
# LangChain / LangGraph
|
| 68 |
+
# ---------------------------------------------------------------------------
|
| 69 |
+
from langchain_core.messages import HumanMessage
|
| 70 |
+
from langchain_mistralai import ChatMistralAI
|
| 71 |
+
from langgraph.prebuilt import create_react_agent
|
| 72 |
+
from langgraph.checkpoint.memory import MemorySaver
|
| 73 |
+
|
| 74 |
+
# ---------------------------------------------------------------------------
|
| 75 |
+
# Project tools
|
| 76 |
+
# ---------------------------------------------------------------------------
|
| 77 |
+
from tools import (
|
| 78 |
+
ALL_TOOLS,
|
| 79 |
+
OUTPUT_DIR,
|
| 80 |
+
_load_json,
|
| 81 |
+
_run_dir,
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
# ---------------------------------------------------------------------------
|
| 85 |
+
# Constants
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
MISTRAL_API_KEY: str = os.environ.get("MISTRAL_API_KEY", "")
|
| 88 |
+
MODEL_NAME: str = "mistral-small-latest"
|
| 89 |
+
DEFAULT_RUN_KEY: str = "abstract"
|
| 90 |
+
THREAD_PREFIX: str = "TA-"
|
| 91 |
+
MAX_USER_MESSAGE_CHARS: int = 4000
|
| 92 |
+
PROVIDER_RETRY_ATTEMPTS: int = 3
|
| 93 |
+
PROVIDER_RETRY_BASE_DELAY_S: float = 1.5
|
| 94 |
+
|
| 95 |
+
# FIX ISSUE 4 — surface missing API key immediately at import time
|
| 96 |
+
_KEY_MISSING = not bool(MISTRAL_API_KEY)
|
| 97 |
+
_KEY_MISSING and print(
|
| 98 |
+
"\n[WARNING] MISTRAL_API_KEY is not set. "
|
| 99 |
+
"All LLM calls will fail with HTTP 401.\n"
|
| 100 |
+
"Set it via: export MISTRAL_API_KEY='your-key'\n"
|
| 101 |
+
"On HuggingFace Spaces: Settings -> Variables and secrets\n"
|
| 102 |
+
)
|
| 103 |
+
|
| 104 |
+
# ---------------------------------------------------------------------------
|
| 105 |
+
# Stop gate identifiers
|
| 106 |
+
# ---------------------------------------------------------------------------
|
| 107 |
+
GATE_POST_PHASE2 = "STOP_GATE_1_AWAIT_REVIEW_TABLE"
|
| 108 |
+
GATE_POST_PHASE3 = "STOP_GATE_2_AWAIT_THEME_MERGE"
|
| 109 |
+
GATE_POST_PHASE4 = "STOP_GATE_3_AWAIT_SATURATION_SIGNOFF"
|
| 110 |
+
GATE_POST_PHASE55 = "STOP_GATE_4_AWAIT_TAXONOMY_REVIEW"
|
| 111 |
+
|
| 112 |
+
# ---------------------------------------------------------------------------
|
| 113 |
+
# Phase labels (used in progress reporting to app.py)
|
| 114 |
+
# ---------------------------------------------------------------------------
|
| 115 |
+
PHASE_LABELS = {
|
| 116 |
+
0: "Awaiting Upload",
|
| 117 |
+
1: "Phase 1 — Familiarisation",
|
| 118 |
+
2: "Phase 2 — Initial Codes",
|
| 119 |
+
3: "Phase 3 — Searching Themes",
|
| 120 |
+
4: "Phase 4 — Reviewing Themes",
|
| 121 |
+
5: "Phase 5 — Defining & Naming",
|
| 122 |
+
6: "Phase 5.5 — PAJAIS Mapping",
|
| 123 |
+
7: "Phase 6 — Report",
|
| 124 |
+
8: "Complete",
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
# ============================================================================
|
| 128 |
+
# System prompt
|
| 129 |
+
# ============================================================================
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
SYSTEM_PROMPT = """
|
| 133 |
+
═══════════════════════════════════════════════════════════════
|
| 134 |
+
🔬 BERTOPIC THEMATIC DISCOVERY AGENT
|
| 135 |
+
Sentence-Level Topic Modeling with Researcher-in-the-Loop
|
| 136 |
+
═══════════════════════════════════════════════════════════════
|
| 137 |
+
|
| 138 |
+
You are a research assistant that performs thematic analysis on
|
| 139 |
+
Scopus academic paper exports using BERTopic + Mistral LLM.
|
| 140 |
+
|
| 141 |
+
Your workflow follows Braun & Clarke's (2006) six-phase Reflexive
|
| 142 |
+
Thematic Analysis framework — the gold standard for qualitative
|
| 143 |
+
research — enhanced with computational NLP at scale.
|
| 144 |
+
|
| 145 |
+
Golden thread: CSV → Sentences → Vectors → Clusters → Topics
|
| 146 |
+
→ Themes → Saturation → Taxonomy Check → Synthesis → Report
|
| 147 |
+
|
| 148 |
+
═══════════════════════════════════════════════════════════════
|
| 149 |
+
⛔ CRITICAL RULES
|
| 150 |
+
═══════════════════════════════════════════════════════════════
|
| 151 |
+
|
| 152 |
+
RULE 1: ONE PHASE PER MESSAGE
|
| 153 |
+
NEVER combine multiple phases in one response.
|
| 154 |
+
Present ONE phase → STOP → wait for approval → next phase.
|
| 155 |
+
|
| 156 |
+
RULE 2: ALL APPROVALS VIA REVIEW TABLE
|
| 157 |
+
The researcher approves/rejects/renames using the Results
|
| 158 |
+
Table below the chat — NOT by typing in chat.
|
| 159 |
+
|
| 160 |
+
Your workflow for EVERY phase:
|
| 161 |
+
1. Call the tool (saves JSON → table auto-refreshes)
|
| 162 |
+
2. Briefly explain what you did in chat (2-3 sentences)
|
| 163 |
+
3. End with: "**Review the table below. Edit Approve/Rename
|
| 164 |
+
columns, then click Submit Review to Agent.**"
|
| 165 |
+
4. STOP. Wait for the researcher's Submit Review.
|
| 166 |
+
|
| 167 |
+
NEVER present large tables or topic lists in chat text.
|
| 168 |
+
NEVER ask researcher to type "approve" in chat.
|
| 169 |
+
The table IS the approval interface.
|
| 170 |
+
|
| 171 |
+
RULE 3: ALWAYS APPEND A PHASE/GATE MARKER
|
| 172 |
+
End each phase response with EXACTLY one marker token:
|
| 173 |
+
[PHASE 1 COMPLETE — READY FOR PHASE 2]
|
| 174 |
+
[STOP GATE 1 — AWAITING REVIEW TABLE SUBMISSION]
|
| 175 |
+
[STOP GATE 2 — AWAITING THEME MERGE CONFIRMATION]
|
| 176 |
+
[STOP GATE 3 — AWAITING SATURATION SIGN-OFF]
|
| 177 |
+
[PHASE 5 COMPLETE — READY FOR PAJAIS MAPPING]
|
| 178 |
+
[STOP GATE 4 — AWAITING TAXONOMY REVIEW]
|
| 179 |
+
[ANALYSIS COMPLETE — ALL PHASES FINISHED]
|
| 180 |
+
Do not modify spelling or punctuation of these markers.
|
| 181 |
+
|
| 182 |
+
═══════════════════════════════════════════════════════════════
|
| 183 |
+
YOUR 7 TOOLS
|
| 184 |
+
═══════════════════════════════════════════════════════════════
|
| 185 |
+
|
| 186 |
+
Tool 1: load_scopus_csv(filepath)
|
| 187 |
+
Load CSV, show columns, estimate sentence count.
|
| 188 |
+
|
| 189 |
+
Tool 2: run_bertopic_discovery(run_key, threshold)
|
| 190 |
+
Split → embed → AgglomerativeClustering cosine → centroid nearest 5 → Plotly charts.
|
| 191 |
+
|
| 192 |
+
Tool 3: label_topics_with_llm(run_key)
|
| 193 |
+
5 nearest centroid sentences → Mistral → label + research area + confidence.
|
| 194 |
+
|
| 195 |
+
Tool 4: consolidate_into_themes(run_key, theme_map)
|
| 196 |
+
Merge researcher-approved topic groups → recompute centroids → new evidence.
|
| 197 |
+
|
| 198 |
+
Tool 5: compare_with_taxonomy(run_key)
|
| 199 |
+
Compare themes against PAJAIS taxonomy (Jiang et al., 2019) → mapped vs NOVEL.
|
| 200 |
+
|
| 201 |
+
Tool 6: generate_comparison_csv()
|
| 202 |
+
Compare themes across abstract vs title runs.
|
| 203 |
+
|
| 204 |
+
Tool 7: export_narrative(run_key)
|
| 205 |
+
500-word Section 7 draft via Mistral.
|
| 206 |
+
|
| 207 |
+
═══════════════════════════════════════════════════════════════
|
| 208 |
+
RUN CONFIGURATIONS
|
| 209 |
+
═══════════════════════════════════════════════════════════════
|
| 210 |
+
|
| 211 |
+
"abstract" — Abstract sentences only (~10 per paper)
|
| 212 |
+
"title" — Title only (1 per paper, 1,390 total)
|
| 213 |
+
|
| 214 |
+
═══════════════════════════════════════════════════════════════
|
| 215 |
+
METHODOLOGY KNOWLEDGE (cite in conversation when relevant)
|
| 216 |
+
═══════════════════════════════════════════════════════════════
|
| 217 |
+
|
| 218 |
+
Braun & Clarke (2006), Qualitative Research in Psychology, 3(2), 77-101:
|
| 219 |
+
- 6-phase reflexive thematic analysis (the framework we follow)
|
| 220 |
+
- "Phases are not linear — move back and forth as required"
|
| 221 |
+
- "When refinements are not adding anything substantial, stop"
|
| 222 |
+
- Researcher is active interpreter, not passive receiver of themes
|
| 223 |
+
|
| 224 |
+
Grootendorst (2022), arXiv:2203.05794 — BERTopic:
|
| 225 |
+
- Modular: any embedding, any clustering, any dim reduction
|
| 226 |
+
- Supports AgglomerativeClustering as alternative to HDBSCAN
|
| 227 |
+
- c-TF-IDF extracts distinguishing words per cluster
|
| 228 |
+
- BERTopic uses AgglomerativeClustering internally for topic reduction
|
| 229 |
+
|
| 230 |
+
Ward (1963), JASA + Lance & Williams (1967) — Agglomerative Clustering:
|
| 231 |
+
- Groups by pairwise cosine similarity threshold
|
| 232 |
+
- No density estimation needed — works in ANY dimension (384d)
|
| 233 |
+
- distance_threshold controls granularity (lower = more topics)
|
| 234 |
+
- Every sentence assigned to a cluster (no outliers)
|
| 235 |
+
- 62-year-old algorithm, gold standard for hierarchical grouping
|
| 236 |
+
|
| 237 |
+
Reimers & Gurevych (2019), EMNLP — Sentence-BERT:
|
| 238 |
+
- all-MiniLM-L6-v2 produces 384d normalized vectors
|
| 239 |
+
- Cosine similarity = semantic relatedness
|
| 240 |
+
- Same meaning clusters together regardless of exact wording
|
| 241 |
+
|
| 242 |
+
PACIS/ICIS Research Categories:
|
| 243 |
+
IS Design Science, HCI, E-Commerce, Knowledge Management,
|
| 244 |
+
IT Governance, Digital Innovation, Social Computing, Analytics,
|
| 245 |
+
IS Security, Green IS, Health IS, IS Education, IT Strategy
|
| 246 |
+
|
| 247 |
+
═══════════════════════════════════════════════════════════════
|
| 248 |
+
B&C PHASE 1: FAMILIARIZATION WITH THE DATA
|
| 249 |
+
"Reading and re-reading, noting initial ideas"
|
| 250 |
+
Tool: load_scopus_csv
|
| 251 |
+
═══════════════════════════════════════════════════════════════
|
| 252 |
+
|
| 253 |
+
CRITICAL ERROR HANDLING:
|
| 254 |
+
- If message says "[No CSV uploaded yet]" → respond:
|
| 255 |
+
"📂 Please upload your Scopus CSV file first using the upload
|
| 256 |
+
button at the top. Then type 'Run abstract only' to begin."
|
| 257 |
+
DO NOT call any tools. DO NOT guess filenames.
|
| 258 |
+
- If a tool returns an error → explain the error clearly and
|
| 259 |
+
suggest what the researcher should do next.
|
| 260 |
+
|
| 261 |
+
When researcher uploads CSV or says "analyze":
|
| 262 |
+
|
| 263 |
+
1. Call load_scopus_csv(filepath) to inspect the data.
|
| 264 |
+
|
| 265 |
+
2. DO NOT run BERTopic yet. Present the data landscape:
|
| 266 |
+
|
| 267 |
+
"📂 **Phase 1: Familiarization** (Braun & Clarke, 2006)
|
| 268 |
+
|
| 269 |
+
Loaded [N] papers (~[M] sentences estimated)
|
| 270 |
+
Columns: Title ✅ | Abstract ✅
|
| 271 |
+
|
| 272 |
+
Sentence-level approach: each abstract splits into ~10
|
| 273 |
+
sentences, each becomes a 384d vector. One paper can
|
| 274 |
+
contribute to MULTIPLE topics.
|
| 275 |
+
|
| 276 |
+
I will run 2 configurations:
|
| 277 |
+
1️⃣ **Abstract only** — what papers FOUND (findings, methods, results)
|
| 278 |
+
2️⃣ **Title only** — what papers CLAIM to be about (author's framing)
|
| 279 |
+
|
| 280 |
+
⚙️ Defaults: threshold=0.7, cosine AgglomerativeClustering, 5 nearest
|
| 281 |
+
|
| 282 |
+
**Ready to proceed to Phase 2?**
|
| 283 |
+
• `run` — execute BERTopic discovery
|
| 284 |
+
• `run abstract` — single config
|
| 285 |
+
• `change threshold to 0.65` — more topics (stricter grouping)
|
| 286 |
+
• `change threshold to 0.8` — fewer topics (looser grouping)"
|
| 287 |
+
|
| 288 |
+
3. WAIT for researcher confirmation before proceeding.
|
| 289 |
+
|
| 290 |
+
═══════════════════════════════════════════════════════════════
|
| 291 |
+
B&C PHASE 2: GENERATING INITIAL CODES
|
| 292 |
+
"Systematically coding interesting features across the dataset"
|
| 293 |
+
Tools: run_bertopic_discovery → label_topics_with_llm
|
| 294 |
+
═══════════════════════════════════════════════════════════════
|
| 295 |
+
|
| 296 |
+
After researcher confirms:
|
| 297 |
+
|
| 298 |
+
1. Call run_bertopic_discovery(run_key, threshold)
|
| 299 |
+
→ Splits papers into sentences (regex, min 30 chars)
|
| 300 |
+
→ Filters publisher boilerplate (copyright, license text)
|
| 301 |
+
→ Embeds with all-MiniLM-L6-v2 (384d, L2-normalized)
|
| 302 |
+
→ AgglomerativeClustering cosine (no UMAP, no dimension reduction)
|
| 303 |
+
→ Finds 5 nearest centroid sentences per topic
|
| 304 |
+
→ Saves Plotly HTML visualizations
|
| 305 |
+
→ Saves embeddings + summaries checkpoints
|
| 306 |
+
|
| 307 |
+
2. Immediately call label_topics_with_llm(run_key)
|
| 308 |
+
→ Sends ALL topics with 5 evidence sentences to Mistral
|
| 309 |
+
→ Returns: label + research area + confidence + niche
|
| 310 |
+
NOTE: NO PACIS categories in Phase 2. PACIS comparison comes in Phase 5.5.
|
| 311 |
+
|
| 312 |
+
3. Present CODED data with EVIDENCE under each topic:
|
| 313 |
+
|
| 314 |
+
"📋 **Phase 2: Initial Codes** — [N] codes from [M] sentences
|
| 315 |
+
|
| 316 |
+
**Code 0: Smart Tourism AI** [IS Design, high, 150 sent, 45 papers]
|
| 317 |
+
Evidence (5 nearest centroid sentences):
|
| 318 |
+
→ "Neural networks predict tourist behavior..." — _Paper #42_
|
| 319 |
+
→ "AI-powered systems optimize resource allocation..." — _Paper #156_
|
| 320 |
+
→ "Deep learning models demonstrate superior accuracy..." — _Paper #78_
|
| 321 |
+
→ "Machine learning classifies visitor patterns..." — _Paper #201_
|
| 322 |
+
→ "ANN achieves 92% accuracy in demand forecasting..." — _Paper #89_
|
| 323 |
+
|
| 324 |
+
**Code 1: VR Destination Marketing** [HCI, high, 67 sent, 18 papers]
|
| 325 |
+
Evidence:
|
| 326 |
+
→ ...
|
| 327 |
+
|
| 328 |
+
📊 4 Plotly visualizations saved (download below)
|
| 329 |
+
|
| 330 |
+
**Review these codes. Ready for Phase 3 (theme search)?**
|
| 331 |
+
• `approve` — codes look good, move to theme grouping
|
| 332 |
+
• `re-run 0.65` — re-run with stricter threshold (more topics)
|
| 333 |
+
• `re-run 0.8` — re-run with looser threshold (fewer topics)
|
| 334 |
+
• `show topic 4 papers` — see all paper titles in topic 4
|
| 335 |
+
• `code 2 looks wrong` — I will show why it was labeled that way
|
| 336 |
+
|
| 337 |
+
📋 **Review Table columns explained:**
|
| 338 |
+
| Column | Meaning |
|
| 339 |
+
|--------|---------|
|
| 340 |
+
| # | Topic number |
|
| 341 |
+
| Topic Label | AI-generated name from 5 nearest sentences |
|
| 342 |
+
| Research Area | General research area (NOT PACIS — that comes later in Phase 5.5) |
|
| 343 |
+
| Confidence | How well the 5 sentences match the label |
|
| 344 |
+
| Sentences | Number of sentences clustered here |
|
| 345 |
+
| Papers | Number of unique papers contributing sentences |
|
| 346 |
+
| Approve | Edit: yes/no — keep or reject this topic |
|
| 347 |
+
| Rename To | Edit: type new name if label is wrong |
|
| 348 |
+
| Your Reasoning | Edit: why you renamed/rejected |"
|
| 349 |
+
|
| 350 |
+
4. ⛔ STOP HERE. Do NOT auto-proceed.
|
| 351 |
+
Say: "Codes generated. Review the table below.
|
| 352 |
+
Edit Approve/Rename columns, then click Submit Review to Agent."
|
| 353 |
+
|
| 354 |
+
5. If researcher types "show topic X papers":
|
| 355 |
+
→ Load summaries.json from checkpoint
|
| 356 |
+
→ Find topic X
|
| 357 |
+
→ List ALL paper titles in that topic (from paper_titles field)
|
| 358 |
+
→ Format as numbered list:
|
| 359 |
+
"📄 **Topic 4: AI in Tourism** — 64 papers:
|
| 360 |
+
1. Neural networks predict tourist behavior...
|
| 361 |
+
2. Deep learning for hotel revenue management...
|
| 362 |
+
3. AI-powered recommendation systems...
|
| 363 |
+
...
|
| 364 |
+
Want to see the 5 key evidence sentences? Type `show topic 4`"
|
| 365 |
+
|
| 366 |
+
6. If researcher types "show topic X":
|
| 367 |
+
→ Show the 5 nearest centroid sentences with full paper titles
|
| 368 |
+
|
| 369 |
+
7. If researcher questions a code:
|
| 370 |
+
→ Show the 5 sentences that generated the label
|
| 371 |
+
→ Explain reasoning: "AgglomerativeClustering groups sentences
|
| 372 |
+
where cosine distance < threshold. These sentences share
|
| 373 |
+
semantic proximity in 384d space even if keywords differ."
|
| 374 |
+
→ Offer re-run with adjusted parameters
|
| 375 |
+
|
| 376 |
+
═══════════════════════════════════════════════════════════════
|
| 377 |
+
B&C PHASE 3: SEARCHING FOR THEMES
|
| 378 |
+
"Collating codes into potential themes"
|
| 379 |
+
Tool: consolidate_into_themes
|
| 380 |
+
═══════════════════════════════════════════════════════════════
|
| 381 |
+
|
| 382 |
+
After researcher approves Phase 2 codes:
|
| 383 |
+
|
| 384 |
+
1. ANALYZE the labeled codes yourself. Look for:
|
| 385 |
+
→ Codes with the SAME research area → likely one theme
|
| 386 |
+
→ Codes with overlapping keywords in evidence → related
|
| 387 |
+
→ Codes with shared papers across clusters → connected
|
| 388 |
+
→ Codes that are sub-aspects of a broader concept → merge
|
| 389 |
+
→ Codes that are niche/distinct → keep standalone
|
| 390 |
+
|
| 391 |
+
2. Present MAPPING TABLE with reasoning:
|
| 392 |
+
|
| 393 |
+
"🔍 **Phase 3: Searching for Themes** (Braun & Clarke, 2006)
|
| 394 |
+
|
| 395 |
+
I analyzed [N] codes and propose [M] themes:
|
| 396 |
+
|
| 397 |
+
| Code (Phase 2) | → | Proposed Theme | Reasoning |
|
| 398 |
+
|---------------------------------|---|-----------------------|------------------------------|
|
| 399 |
+
| Code 0: Neural Network Tourism | → | AI & ML in Tourism | Same research area, |
|
| 400 |
+
| Code 1: Deep Learning Predict. | → | AI & ML in Tourism | shared methodology, |
|
| 401 |
+
| Code 5: ML Revenue Management | → | AI & ML in Tourism | Papers #42,#78 in all 3 |
|
| 402 |
+
| Code 2: VR Destination Mktg | → | VR & Metaverse | Both HCI category, |
|
| 403 |
+
| Code 3: Metaverse Experiences | → | VR & Metaverse | 'virtual reality' overlap |
|
| 404 |
+
| Code 4: Instagram Tourism | → | Social Media (alone) | Distinct platform focus |
|
| 405 |
+
| Code 8: Green Tourism | → | Sustainability (alone)| Niche, no overlap |
|
| 406 |
+
|
| 407 |
+
**Do you agree?**
|
| 408 |
+
• `agree` — consolidate as shown
|
| 409 |
+
• `group 4 6 call it Digital Marketing` — custom grouping
|
| 410 |
+
• `move code 5 to standalone` — adjust
|
| 411 |
+
• `split AI theme into two` — more granular"
|
| 412 |
+
|
| 413 |
+
3. ⛔ STOP HERE. Do NOT proceed to Phase 4.
|
| 414 |
+
Say: "Review the consolidated themes in the table below.
|
| 415 |
+
Edit Approve/Rename columns, then click Submit Review to Agent."
|
| 416 |
+
WAIT for the researcher's Submit Review.
|
| 417 |
+
|
| 418 |
+
4. ONLY after explicit approval, call:
|
| 419 |
+
consolidate_into_themes(run_key, {"AI & ML": [0,1,5], "VR": [2,3], ...})
|
| 420 |
+
|
| 421 |
+
5. Present consolidated themes with NEW centroid evidence:
|
| 422 |
+
|
| 423 |
+
"🎯 **Themes consolidated** (new centroids computed)
|
| 424 |
+
|
| 425 |
+
**Theme: AI & ML in Tourism** (294 sent, 83 papers)
|
| 426 |
+
Merged from: Codes 0, 1, 5
|
| 427 |
+
New evidence (recalculated after merge):
|
| 428 |
+
→ "Neural networks predict tourist behavior..." — _Paper #42_
|
| 429 |
+
→ "Deep learning optimizes hotel pricing..." — _Paper #78_
|
| 430 |
+
→ ...
|
| 431 |
+
|
| 432 |
+
✅ Themes look correct? Or adjust?"
|
| 433 |
+
|
| 434 |
+
═══════════════════════════════════════════════════════════════
|
| 435 |
+
B&C PHASE 4: REVIEWING THEMES
|
| 436 |
+
"Checking if themes work in relation to coded extracts
|
| 437 |
+
and the entire data set"
|
| 438 |
+
Tool: (conversation — no tool call, agent reasons)
|
| 439 |
+
═══════════════════════════════════════════════════════════════
|
| 440 |
+
|
| 441 |
+
After consolidation, perform SATURATION CHECK:
|
| 442 |
+
|
| 443 |
+
1. Analyze ALL theme pairs for remaining merge potential:
|
| 444 |
+
|
| 445 |
+
"🔍 **Phase 4: Reviewing Themes** — Saturation Analysis
|
| 446 |
+
|
| 447 |
+
| Theme A | Theme B | Overlap | Merge? | Why |
|
| 448 |
+
|-------------|-------------|---------|--------|--------------------|
|
| 449 |
+
| AI & ML | VR Tourism | None | ❌ | Different domains |
|
| 450 |
+
| AI & ML | ChatGPT | Low | ❌ | GenAI ≠ predictive |
|
| 451 |
+
| Social Media| VR Tourism | None | ❌ | Different channels |
|
| 452 |
+
|
| 453 |
+
2. If NO themes can merge:
|
| 454 |
+
"⛔ **Saturation reached** (per Braun & Clarke, 2006:
|
| 455 |
+
'when refinements are not adding anything substantial, stop')
|
| 456 |
+
|
| 457 |
+
Reasoning:
|
| 458 |
+
1. No remaining themes share a research area
|
| 459 |
+
2. No keyword overlap between any theme pair
|
| 460 |
+
3. Evidence sentences are semantically distinct
|
| 461 |
+
4. Further merging would lose research distinctions
|
| 462 |
+
|
| 463 |
+
**Do you agree iteration is complete?**
|
| 464 |
+
• `agree` — finalize, move to Phase 5
|
| 465 |
+
• `try merging X and Y` — override my recommendation"
|
| 466 |
+
|
| 467 |
+
3. If themes CAN still merge:
|
| 468 |
+
"🔄 **Further consolidation possible:**
|
| 469 |
+
Themes 'Social Media' and 'Digital Marketing' share 3 keywords.
|
| 470 |
+
Suggest merging. Want me to consolidate?"
|
| 471 |
+
|
| 472 |
+
4. ⛔ STOP HERE. Do NOT proceed to Phase 5.
|
| 473 |
+
Say: "Saturation analysis complete. Review themes in the table.
|
| 474 |
+
Edit Approve/Rename columns, then click Submit Review to Agent."
|
| 475 |
+
|
| 476 |
+
═══════════════════════════════════════════════════════════════
|
| 477 |
+
B&C PHASE 5: DEFINING AND NAMING THEMES
|
| 478 |
+
"Generating clear definitions and names"
|
| 479 |
+
Tool: (conversation — agent + researcher co-create)
|
| 480 |
+
═══════════════════════════════════════════════════════════════
|
| 481 |
+
|
| 482 |
+
After saturation confirmed:
|
| 483 |
+
|
| 484 |
+
1. Present final theme definitions:
|
| 485 |
+
|
| 486 |
+
"📝 **Phase 5: Theme Definitions**
|
| 487 |
+
|
| 488 |
+
**Theme 1: AI & Machine Learning in Tourism**
|
| 489 |
+
Definition: Research applying predictive ML/DL methods
|
| 490 |
+
(neural networks, random forests, deep learning) to tourism
|
| 491 |
+
problems including demand forecasting, pricing optimization,
|
| 492 |
+
and visitor behavior classification.
|
| 493 |
+
Scope: 294 sentences across 83 papers.
|
| 494 |
+
Research area: technology adoption. Confidence: High.
|
| 495 |
+
|
| 496 |
+
**Theme 2: Virtual Reality & Metaverse Tourism**
|
| 497 |
+
Definition: ...
|
| 498 |
+
|
| 499 |
+
**Want to rename any theme? Adjust any definition?**"
|
| 500 |
+
|
| 501 |
+
2. ⛔ STOP HERE. Do NOT proceed to Phase 5.5 or second run.
|
| 502 |
+
Say: "Final theme names ready. Review in the table below.
|
| 503 |
+
Edit Rename To column if any names need changing, then click Submit Review."
|
| 504 |
+
|
| 505 |
+
3. ONLY after approval: repeat ALL of Phase 2-5 for the SECOND run config.
|
| 506 |
+
(If first run was "abstract", now run "title" — or vice versa)
|
| 507 |
+
|
| 508 |
+
═══════════════════════════════════════════════════════════════
|
| 509 |
+
PHASE 5.5: TAXONOMY COMPARISON
|
| 510 |
+
"Grounding themes against established IS research categories"
|
| 511 |
+
Tool: compare_with_taxonomy
|
| 512 |
+
═══════════════════════════════════════════════════════════════
|
| 513 |
+
|
| 514 |
+
After BOTH runs have finalized themes (Phase 5 complete for each):
|
| 515 |
+
|
| 516 |
+
1. Call compare_with_taxonomy(run_key) for each completed run.
|
| 517 |
+
→ Mistral maps each theme to PAJAIS taxonomy (Jiang et al., 2019)
|
| 518 |
+
→ Flags themes as MAPPED (known category) or NOVEL (emerging)
|
| 519 |
+
|
| 520 |
+
2. Present the mapping with researcher review:
|
| 521 |
+
|
| 522 |
+
"📚 **Phase 5.5: Taxonomy Comparison** (Jiang et al., 2019)
|
| 523 |
+
|
| 524 |
+
**Mapped to established PAJAIS categories:**
|
| 525 |
+
|
| 526 |
+
| Your Theme | → | PAJAIS Category | Confidence | Reasoning |
|
| 527 |
+
|---|---|---|---|---|
|
| 528 |
+
| AI & ML in Tourism | → | Business Intelligence & Analytics | high | ML/DL methods for prediction |
|
| 529 |
+
| VR & Metaverse | → | Human Behavior & HCI | high | Immersive technology interaction |
|
| 530 |
+
| Social Media Tourism | → | Social Media & Business Impact | high | Direct category match |
|
| 531 |
+
|
| 532 |
+
**🆕 NOVEL themes (not in existing PAJAIS taxonomy):**
|
| 533 |
+
|
| 534 |
+
| Your Theme | Status | Reasoning |
|
| 535 |
+
|---|---|---|
|
| 536 |
+
| ChatGPT in Tourism | 🆕 NOVEL | Generative AI is post-2019, not in taxonomy |
|
| 537 |
+
| Sustainable AI Tourism | 🆕 NOVEL | Cross-cuts Green IT + Analytics |
|
| 538 |
+
|
| 539 |
+
These NOVEL themes represent **emerging research areas** that
|
| 540 |
+
extend beyond the established PAJAIS classification.
|
| 541 |
+
|
| 542 |
+
**Researcher: Review this mapping.**
|
| 543 |
+
• `approve` — mapping is correct
|
| 544 |
+
• `theme X should map to Y instead` — adjust
|
| 545 |
+
• `merge novel themes into one` — consolidate emerging themes
|
| 546 |
+
• `this novel theme is actually part of [category]` — reclassify"
|
| 547 |
+
|
| 548 |
+
3. ⛔ STOP HERE. Do NOT proceed to Phase 6.
|
| 549 |
+
Say: "PAJAIS taxonomy mapping complete. Review in the table below.
|
| 550 |
+
Edit Approve column for any mappings you disagree with, then click Submit Review."
|
| 551 |
+
|
| 552 |
+
4. ONLY after approval, ask:
|
| 553 |
+
"Want me to consolidate any novel themes with existing ones?
|
| 554 |
+
Or keep them separate as evidence of emerging research areas?"
|
| 555 |
+
|
| 556 |
+
5. ⛔ STOP AGAIN. WAIT for this answer before generating report.
|
| 557 |
+
|
| 558 |
+
═══════════════════════════════════════════════════════════════
|
| 559 |
+
B&C PHASE 6: PRODUCING THE REPORT
|
| 560 |
+
"Selection of vivid, compelling extract examples"
|
| 561 |
+
Tools: generate_comparison_csv → export_narrative
|
| 562 |
+
═══════════════════════════════════════════════════════════════
|
| 563 |
+
|
| 564 |
+
After BOTH run configs have finalized themes:
|
| 565 |
+
|
| 566 |
+
1. Call generate_comparison_csv()
|
| 567 |
+
→ Compares themes across abstract vs title configs
|
| 568 |
+
|
| 569 |
+
2. Say briefly in chat:
|
| 570 |
+
"Cross-run comparison complete. Check the Download tab for:
|
| 571 |
+
• comparison.csv — abstract vs title themes side by side
|
| 572 |
+
Review the themes in the table below.
|
| 573 |
+
Click Submit Review to confirm, then I'll generate the narrative."
|
| 574 |
+
|
| 575 |
+
3. ⛔ STOP. Wait for Submit Review.
|
| 576 |
+
|
| 577 |
+
4. After approval, call export_narrative(run_key)
|
| 578 |
+
→ Mistral writes 500-word paper section referencing:
|
| 579 |
+
methodology, B&C phases, key themes, limitations
|
| 580 |
+
|
| 581 |
+
═══════════════════════════════════════════════════════════════
|
| 582 |
+
CRITICAL RULES
|
| 583 |
+
═══════════════════════════════════════════════════════════════
|
| 584 |
+
|
| 585 |
+
- ALWAYS follow B&C phases in order. Name each phase explicitly.
|
| 586 |
+
- ALWAYS wait for researcher confirmation between phases.
|
| 587 |
+
- ALWAYS show evidence sentences with paper metadata.
|
| 588 |
+
- ALWAYS cite B&C (2006) when discussing iteration or saturation.
|
| 589 |
+
- ALWAYS cite Grootendorst (2022) when explaining cluster behavior.
|
| 590 |
+
- ALWAYS call label_topics_with_llm before presenting topic labels.
|
| 591 |
+
- ALWAYS call compare_with_taxonomy before claiming PAJAIS mappings.
|
| 592 |
+
- Use threshold=0.7 as default (lower = more topics, higher = fewer).
|
| 593 |
+
- If too many topics (>200), suggest increasing threshold to 0.8.
|
| 594 |
+
- If too few topics (<20), suggest decreasing threshold to 0.6.
|
| 595 |
+
- NEVER skip Phase 4 saturation check or Phase 5.5 taxonomy comparison.
|
| 596 |
+
- NEVER proceed to Phase 6 without both runs completing Phase 5.5.
|
| 597 |
+
- NEVER invent topic labels — only present labels returned by Tool 3.
|
| 598 |
+
- NEVER cite paper IDs, titles, or sentences from memory — only from tool output.
|
| 599 |
+
- NEVER claim a theme is NOVEL or MAPPED without calling Tool 5 first.
|
| 600 |
+
- NEVER fabricate sentence counts or paper counts — only use tool-reported numbers.
|
| 601 |
+
- If a tool returns an error, explain clearly and continue.
|
| 602 |
+
- Keep responses concise. Tables + evidence, not paragraphs.
|
| 603 |
+
|
| 604 |
+
"""
|
| 605 |
+
|
| 606 |
+
# ============================================================================
|
| 607 |
+
# LLM + Agent construction
|
| 608 |
+
# ============================================================================
|
| 609 |
+
|
| 610 |
+
def _build_llm() -> ChatMistralAI:
    """Create the Mistral chat model used by the ReAct agent for planning."""
    llm_settings = {
        "model": MODEL_NAME,
        "api_key": MISTRAL_API_KEY,
        # Low temperature plus a fixed seed keeps phase behaviour repeatable.
        "temperature": 0.1,
        "random_seed": 42,
        "timeout": 45,
        "max_retries": 3,
    }
    return ChatMistralAI(**llm_settings)
|
| 619 |
+
|
| 620 |
+
|
| 621 |
+
def _build_agent():
    """Assemble the LangGraph ReAct agent.

    The graph is wired with a fresh ``MemorySaver`` checkpointer, the model
    from ``_build_llm()``, every tool in ``ALL_TOOLS`` and ``SYSTEM_PROMPT``
    as its prompt.
    """
    checkpointer = MemorySaver()
    chat_model = _build_llm()
    agent_kwargs = {
        "model": chat_model,
        "tools": ALL_TOOLS,
        "checkpointer": checkpointer,
        "prompt": SYSTEM_PROMPT,
    }
    return create_react_agent(**agent_kwargs)
|
| 631 |
+
|
| 632 |
+
|
| 633 |
+
# Singleton ReAct graph, built once when this module is imported.
# _invoke_react_with_retries() calls .invoke() on this shared instance; the
# conversation to use is selected per call via the thread_id in build_config().
_react_agent = _build_agent()
|
| 635 |
+
|
| 636 |
+
|
| 637 |
+
# ============================================================================
|
| 638 |
+
# Config builder
|
| 639 |
+
# ============================================================================
|
| 640 |
+
|
| 641 |
+
def build_config(thread_id: str) -> dict:
    """
    Wrap a conversation thread id in the config mapping used for invocation.

    Parameters
    ----------
    thread_id : str — unique conversation identifier

    Returns
    -------
    dict — ``{"configurable": {"thread_id": thread_id}}``, passed as
           `config` to _react_agent.invoke()
    """
    configurable = {"thread_id": thread_id}
    return {"configurable": configurable}
|
| 654 |
+
|
| 655 |
+
|
| 656 |
+
# ============================================================================
|
| 657 |
+
# State helpers
|
| 658 |
+
# ============================================================================
|
| 659 |
+
|
| 660 |
+
def _init_state(state: dict) -> dict:
    """Return a new state dict with every required key present.

    Values already in ``state`` win over the defaults; the input dict is not
    modified. A fresh thread id is generated for the defaults on every call
    and is only kept when ``state`` has no ``thread_id`` of its own.
    """
    merged = {
        "phase": 0,
        "file_path": None,
        "run_key": DEFAULT_RUN_KEY,
        "review_df": [],
        "theme_map": {},
        "charts": {},
        "output_files": [],
        "thread_id": THREAD_PREFIX + uuid.uuid4().hex[:8],
        "stop_gate": None,
        "context_resets": 0,
    }
    merged.update(state)
    return merged
|
| 675 |
+
|
| 676 |
+
|
| 677 |
+
def _truthy(value: object) -> bool:
|
| 678 |
+
"""Accept bool / int / common string truthy values from Gradio tables."""
|
| 679 |
+
if isinstance(value, bool):
|
| 680 |
+
return value
|
| 681 |
+
if isinstance(value, (int, float)):
|
| 682 |
+
return value != 0
|
| 683 |
+
if isinstance(value, str):
|
| 684 |
+
return value.strip().lower() in {"true", "1", "yes", "y"}
|
| 685 |
+
return False
|
| 686 |
+
|
| 687 |
+
|
| 688 |
+
def _trim_user_message(user_message: str) -> str:
    """Cap the user message at MAX_USER_MESSAGE_CHARS characters.

    Falsy input becomes an empty string. When the text is cut, a bracketed
    SYSTEM note is appended so the model knows the message was truncated.
    """
    text = str(user_message or "")
    if len(text) <= MAX_USER_MESSAGE_CHARS:
        return text
    truncation_note = "\n\n[SYSTEM: User message was truncated to keep context bounded.]"
    return text[:MAX_USER_MESSAGE_CHARS] + truncation_note
|
| 697 |
+
|
| 698 |
+
|
| 699 |
+
def _is_context_overflow_error(exc: Exception) -> bool:
|
| 700 |
+
"""Detect model context-limit failures from Mistral / LangChain wrappers."""
|
| 701 |
+
msg = str(exc).lower()
|
| 702 |
+
return (
|
| 703 |
+
"maximum context length" in msg
|
| 704 |
+
or "too large for model" in msg
|
| 705 |
+
or "prompt contains" in msg
|
| 706 |
+
or '"code":"3051"' in msg
|
| 707 |
+
)
|
| 708 |
+
|
| 709 |
+
|
| 710 |
+
def _is_transient_provider_error(exc: Exception) -> bool:
|
| 711 |
+
"""Detect transient provider outages (e.g., Mistral 503 unreachable backend)."""
|
| 712 |
+
msg = str(exc).lower()
|
| 713 |
+
return (
|
| 714 |
+
"unreachable_backend" in msg
|
| 715 |
+
or "internal server error" in msg
|
| 716 |
+
or '"code":"1100"' in msg
|
| 717 |
+
or '"raw_status_code":503' in msg
|
| 718 |
+
or '"raw_status_code":502' in msg
|
| 719 |
+
or '"raw_status_code":504' in msg
|
| 720 |
+
or "service unavailable" in msg
|
| 721 |
+
)
|
| 722 |
+
|
| 723 |
+
|
| 724 |
+
def _invoke_react_with_retries(enriched: str, thread_id: str) -> dict:
    """Call the ReAct graph, retrying only transient provider failures.

    Parameters
    ----------
    enriched  : str — user message with pipeline context already prepended
    thread_id : str — conversation thread the call belongs to

    Returns
    -------
    dict — whatever ``_react_agent.invoke()`` returns

    Raises
    ------
    Exception — context-overflow and non-transient errors are re-raised
                immediately; a transient error is re-raised once the final
                attempt has failed.
    """
    # Always make at least one attempt, even if the retry count is set to 0
    # or a negative number; otherwise the agent would never be called.
    attempts = max(1, PROVIDER_RETRY_ATTEMPTS)

    for attempt in range(attempts):
        try:
            # NOTE(review): every retry re-sends the same HumanMessage on the
            # same thread — confirm the checkpointer does not retain a
            # duplicate from the failed attempt.
            return _react_agent.invoke(
                {"messages": [HumanMessage(content=enriched)]},
                config=build_config(thread_id),
            )
        except Exception as exc:
            retryable = (
                not _is_context_overflow_error(exc)
                and _is_transient_provider_error(exc)
            )
            if not retryable or attempt == attempts - 1:
                # Bare raise keeps the original traceback intact.
                raise
            # Linear back-off: base delay, then 2x, 3x, ...
            time.sleep(PROVIDER_RETRY_BASE_DELAY_S * (attempt + 1))

    # Unreachable, but keeps static type checkers satisfied.
    raise RuntimeError("Unexpected retry flow in _invoke_react_with_retries")
|
| 746 |
+
|
| 747 |
+
|
| 748 |
+
def _parse_review_df(review_df: list[dict]) -> dict:
    """
    Convert review table rows into theme_map for consolidate_into_themes.

    Only rows whose "Approve" cell is truthy (see ``_truthy``) are included.
    Cluster IDs are grouped by theme name, taken from "Rename To", then
    "Topic Label", then the literal "Unnamed". A candidate name is used only
    if it is a string with non-whitespace content, so blank or NaN cells fall
    through to the next candidate instead of raising or producing an empty
    theme name.

    Rows whose "#" cell cannot be read as an integer (None, "", NaN, ...) are
    skipped rather than aborting the whole submission. A row with no "#" key
    keeps the previous default of cluster 0.

    Parameters
    ----------
    review_df : list[dict] — rows from the Gradio Dataframe

    Returns
    -------
    dict — {theme_name: [cluster_id, ...]}
    """
    theme_map: dict[str, list[int]] = {}

    for row in review_df or []:
        if not _truthy(row.get("Approve")):
            continue

        try:
            # Going through float() accepts "3", "3.0" and 3.0 alike.
            cluster_id = int(float(row.get("#", 0)))
        except (TypeError, ValueError, OverflowError):
            continue

        theme_name = "Unnamed"
        for candidate in (row.get("Rename To"), row.get("Topic Label")):
            if isinstance(candidate, str) and candidate.strip():
                theme_name = candidate.strip()
                break

        theme_map.setdefault(theme_name, []).append(cluster_id)

    return theme_map
|
| 774 |
+
|
| 775 |
+
|
| 776 |
+
def _extract_charts(run_key: str, state: dict) -> dict:
    """
    Merge the chart files found in the run directory into the known charts.

    Starts from a copy of state["charts"] and adds or overwrites an entry for
    each expected HTML file that exists on disk. Charts whose file is missing
    keep whatever entry they already had.
    """
    run_directory = _run_dir(run_key)
    expected_charts = (
        ("Intertopic Map", "intertopic.html"),
        ("Top Words", "topwords.html"),
        ("Hierarchy", "hierarchy.html"),
        ("Heatmap", "heatmap.html"),
    )

    charts = {**state.get("charts", {})}
    for title, filename in expected_charts:
        html_path = run_directory / filename
        if html_path.exists():
            charts[title] = str(html_path)
    return charts
|
| 794 |
+
|
| 795 |
+
|
| 796 |
+
def _collect_output_files(state: dict) -> list[str]:
    """List the generated artefact paths that currently exist on disk.

    Checks the per-run JSON/text artefacts in the run directory plus the
    shared comparison.csv under OUTPUT_DIR, and returns the existing ones as
    strings in that fixed order.
    """
    run_directory = _run_dir(state.get("run_key", DEFAULT_RUN_KEY))
    per_run_names = (
        "summaries.json",
        "labels.json",
        "themes.json",
        "taxonomy_map.json",
        "narrative.txt",
    )
    artefacts = [run_directory / name for name in per_run_names]
    artefacts.append(OUTPUT_DIR / "comparison.csv")
    return [str(path) for path in artefacts if path.exists()]
|
| 810 |
+
|
| 811 |
+
|
| 812 |
+
def _detect_phase_advance(reply: str, current_phase: int) -> int:
|
| 813 |
+
"""
|
| 814 |
+
Read the agent's STOP / COMPLETE markers and return the updated phase index.
|
| 815 |
+
Phase only advances when the agent emits the correct marker string.
|
| 816 |
+
"""
|
| 817 |
+
markers = {
|
| 818 |
+
"[PHASE 1 COMPLETE — READY FOR PHASE 2]": 1,
|
| 819 |
+
"[STOP GATE 1 — AWAITING REVIEW TABLE SUBMISSION]": 2,
|
| 820 |
+
"[STOP GATE 2 — AWAITING THEME MERGE CONFIRMATION]":3,
|
| 821 |
+
"[STOP GATE 3 — AWAITING SATURATION SIGN-OFF]": 4,
|
| 822 |
+
"[PHASE 5 COMPLETE — READY FOR PAJAIS MAPPING]": 5,
|
| 823 |
+
"[STOP GATE 4 — AWAITING TAXONOMY REVIEW]": 6,
|
| 824 |
+
"[ANALYSIS COMPLETE — ALL PHASES FINISHED]": 8,
|
| 825 |
+
}
|
| 826 |
+
marker_phase = next(
|
| 827 |
+
(v for k, v in markers.items() if k in reply),
|
| 828 |
+
None,
|
| 829 |
+
)
|
| 830 |
+
if marker_phase is not None:
|
| 831 |
+
return max(current_phase, marker_phase)
|
| 832 |
+
|
| 833 |
+
# Fallback: infer from common phase headings when explicit markers are absent.
|
| 834 |
+
text = reply.lower()
|
| 835 |
+
inferred = current_phase
|
| 836 |
+
|
| 837 |
+
inferred = max(
|
| 838 |
+
inferred,
|
| 839 |
+
1 if ("phase 1" in text and "familiar" in text) else current_phase,
|
| 840 |
+
)
|
| 841 |
+
inferred = max(
|
| 842 |
+
inferred,
|
| 843 |
+
2 if ("phase 2" in text and "initial code" in text) else current_phase,
|
| 844 |
+
)
|
| 845 |
+
inferred = max(
|
| 846 |
+
inferred,
|
| 847 |
+
3 if ("phase 3" in text and ("searching" in text or "theme" in text)) else current_phase,
|
| 848 |
+
)
|
| 849 |
+
inferred = max(
|
| 850 |
+
inferred,
|
| 851 |
+
4 if ("phase 4" in text and ("review" in text or "saturation" in text)) else current_phase,
|
| 852 |
+
)
|
| 853 |
+
inferred = max(
|
| 854 |
+
inferred,
|
| 855 |
+
5 if ("phase 5" in text and ("defining" in text or "naming" in text or "definition" in text)) else current_phase,
|
| 856 |
+
)
|
| 857 |
+
inferred = max(
|
| 858 |
+
inferred,
|
| 859 |
+
6 if (("phase 5.5" in text and ("taxonomy" in text or "pajais" in text))
|
| 860 |
+
or ("taxonomy comparison" in text and "pajais" in text))
|
| 861 |
+
else current_phase,
|
| 862 |
+
)
|
| 863 |
+
inferred = max(
|
| 864 |
+
inferred,
|
| 865 |
+
7 if ("phase 6" in text and "report" in text)
|
| 866 |
+
or ("analysis complete" in text and "all phases" in text)
|
| 867 |
+
else current_phase,
|
| 868 |
+
)
|
| 869 |
+
|
| 870 |
+
inferred = max(
|
| 871 |
+
inferred,
|
| 872 |
+
8 if ("analysis complete" in text and "all phases" in text)
|
| 873 |
+
else current_phase,
|
| 874 |
+
)
|
| 875 |
+
|
| 876 |
+
return inferred
|
| 877 |
+
|
| 878 |
+
|
| 879 |
+
def _detect_stop_gate(reply: str) -> str | None:
    """Return the stop gate constant announced in the agent reply, or None.

    If the reply contains more than one gate marker, the latest gate in
    pipeline order is returned. This matches ``_detect_phase_advance``, which
    reports the highest phase; previously the earliest gate won and could
    leave the state pointing at a gate that had already been passed.
    """
    # Listed in pipeline order; the last match is the furthest gate reached.
    gate_markers = {
        "[STOP GATE 1 — AWAITING REVIEW TABLE SUBMISSION]": GATE_POST_PHASE2,
        "[STOP GATE 2 — AWAITING THEME MERGE CONFIRMATION]": GATE_POST_PHASE3,
        "[STOP GATE 3 — AWAITING SATURATION SIGN-OFF]": GATE_POST_PHASE4,
        "[STOP GATE 4 — AWAITING TAXONOMY REVIEW]": GATE_POST_PHASE55,
    }
    announced = [gate for marker, gate in gate_markers.items() if marker in reply]
    return announced[-1] if announced else None
|
| 891 |
+
|
| 892 |
+
|
| 893 |
+
# ============================================================================
|
| 894 |
+
# FIX ISSUE 3 — populate review_df from labels.json after Phase 2
|
| 895 |
+
# ============================================================================
|
| 896 |
+
|
| 897 |
+
def _populate_review_df(state: dict) -> dict:
    """
    Fill state["review_df"] from the run's labels.json when it is still empty.

    Returns ``state`` unchanged if labels.json does not exist or review_df
    already has rows. Otherwise returns a new dict whose "review_df" holds one
    row per entry loaded from labels.json, with these columns:

        "#", "Topic Label", "Top Evidence", "Sentences", "Papers",
        "Approve", "Rename To", "Reasoning"

    (The column set is intended to match REVIEW_COLUMNS in app.py — verify
    there.) Every row starts unapproved, "Rename To" is pre-filled with the
    label, and "Papers" is left blank.
    """
    labels_path = OUTPUT_DIR / state.get("run_key", DEFAULT_RUN_KEY) / "labels.json"

    if not labels_path.exists() or state.get("review_df"):
        return state

    rows = []
    for entry in _load_json(labels_path):
        evidence = entry.get("evidence")
        rows.append({
            "#": entry.get("cluster_id", 0),
            "Topic Label": entry.get("label", ""),
            "Top Evidence": entry["evidence"][0] if evidence else "",
            "Sentences": entry.get("size", 0),
            "Papers": "",
            "Approve": False,
            "Rename To": entry.get("label", ""),
            "Reasoning": entry.get("reasoning", ""),
        })

    return {**state, "review_df": rows}
|
| 930 |
+
|
| 931 |
+
|
| 932 |
+
# ============================================================================
|
| 933 |
+
# Context builder
|
| 934 |
+
# ============================================================================
|
| 935 |
+
|
| 936 |
+
def _build_context_message(user_message: str, state: dict) -> str:
    """
    Prefix the user message with a JSON snapshot of the pipeline state.

    The snapshot carries the current phase and its label, the active stop
    gate, file path, run key, whether review rows / a theme map exist, the
    names of available charts and the number of output files, so the LLM does
    not have to rely on its own (potentially stale) memory.
    """
    phase = state.get("phase", 0)
    snapshot = {
        "current_phase": phase,
        "phase_label": PHASE_LABELS.get(phase, "Unknown"),
        "active_stop_gate": state.get("stop_gate"),
        "file_path": state.get("file_path"),
        "run_key": state.get("run_key", DEFAULT_RUN_KEY),
        "review_submitted": bool(state.get("review_df")),
        "theme_map_ready": bool(state.get("theme_map")),
        "charts_available": list(state.get("charts", {}).keys()),
        "output_files_count": len(state.get("output_files", [])),
    }
    rendered = json.dumps(snapshot, indent=2)
    header = f"```json\n[PIPELINE CONTEXT]\n{rendered}\n```\n\n"
    return f"{header}**User message:** {user_message}"
|
| 958 |
+
|
| 959 |
+
|
| 960 |
+
# ============================================================================
|
| 961 |
+
# Phase-specific pre-processing
|
| 962 |
+
# ============================================================================
|
| 963 |
+
|
| 964 |
+
def _preprocess_phase3(state: dict) -> tuple[str, dict]:
    """
    Turn the submitted review table into a theme_map ahead of Phase 3.

    Returns a pair: a SYSTEM annotation to append to the user message, which
    embeds the parsed theme_map as JSON and tells the agent to call
    consolidate_into_themes, and a copy of ``state`` with "theme_map" set.

    The caller is expected to use this only when the stop gate is
    GATE_POST_PHASE2 and review_df is non-empty.
    """
    parsed_map = _parse_review_df(state.get("review_df", []))
    annotation = "".join((
        "\n\n[SYSTEM: Review table submitted. ",
        f"Parsed theme_map = {json.dumps(parsed_map)}. ",
        "Proceed to Phase 3 and call consolidate_into_themes.]",
    ))
    next_state = {**state, "theme_map": parsed_map}
    return annotation, next_state
|
| 980 |
+
|
| 981 |
+
|
| 982 |
+
# ============================================================================
|
| 983 |
+
# Public invoke interface
|
| 984 |
+
# ============================================================================
|
| 985 |
+
|
| 986 |
+
class ThematicAnalysisAgent:
    """
    Stateless facade over the LangGraph ReAct agent.

    Conversation state travels with the caller. app.py uses it as:
        reply, new_state = agent.invoke(user_message, agent_state)
    """

    def invoke(self, user_message: str, state: dict) -> tuple[str, dict]:
        """
        Handle a single user turn.

        Parameters
        ----------
        user_message : str  — raw text from the Gradio chat input
        state        : dict — agent_state from app.py (a new copy is returned)

        Returns
        -------
        tuple[str, dict] — (reply_markdown, updated_state)

        Raises
        ------
        Exception — any failure of the first agent call that is neither a
                    transient provider outage nor a context overflow.
        """
        state = _init_state(state)
        user_message = _trim_user_message(user_message)

        if not MISTRAL_API_KEY:
            missing_key_reply = (
                "MISTRAL_API_KEY is not set, so the agent cannot run tool-planning LLM calls. "
                "Set the key and retry.\n\n"
                "Example:\n"
                "`export MISTRAL_API_KEY='your-key'`"
            )
            return missing_key_reply, state

        thread_id = state["thread_id"]

        # A review table submitted at the post-Phase-2 gate is parsed into a
        # theme_map and announced to the agent via an extra annotation.
        extra_context = ""
        if state.get("stop_gate") == GATE_POST_PHASE2 and state.get("review_df"):
            extra_context, state = _preprocess_phase3(state)

        # Prepend the pipeline context snapshot to the message.
        enriched = _build_context_message(user_message + extra_context, state)

        try:
            result = _invoke_react_with_retries(enriched, thread_id)
        except Exception as exc:
            if _is_transient_provider_error(exc):
                outage_reply = (
                    "Mistral is temporarily unavailable (503/unreachable_backend). "
                    "Automatic retries were attempted. Please retry in 30-60 seconds."
                )
                return outage_reply, state

            if not _is_context_overflow_error(exc):
                raise

            # Context window exhausted: switch to a fresh LangGraph thread
            # and try once more, relying on the pipeline context snapshot.
            thread_id = THREAD_PREFIX + uuid.uuid4().hex[:8]
            state = {
                **state,
                "thread_id": thread_id,
                "context_resets": state.get("context_resets", 0) + 1,
            }
            reset_note = (
                "\n\n[SYSTEM: Previous thread exceeded model context and was reset. "
                "Continue from pipeline context and saved artifacts.]"
            )
            retry_enriched = _build_context_message(
                user_message + extra_context + reset_note,
                state,
            )

            try:
                result = _invoke_react_with_retries(retry_enriched, thread_id)
            except Exception as retry_exc:
                if _is_transient_provider_error(retry_exc):
                    failure_reply = (
                        "The previous request exceeded model context and the retry hit a "
                        "temporary Mistral outage (503). Please resend your last short "
                        "command in about a minute."
                    )
                else:
                    failure_reply = (
                        "The model context exceeded the provider limit and an automatic "
                        "thread reset retry also failed. Please resend your last command "
                        "(short form) to continue."
                    )
                return failure_reply, state

        # The reply is the content of the last AIMessage in the result.
        reply = "Agent returned no response. Check MISTRAL_API_KEY and retry."
        for message in reversed(result.get("messages", [])):
            if hasattr(message, "content") and message.__class__.__name__ == "AIMessage":
                reply = message.content
                break

        # Derive the next state from the reply text and the files on disk.
        updated_state = {
            **state,
            "phase": _detect_phase_advance(reply, state["phase"]),
            "stop_gate": _detect_stop_gate(reply),
            "charts": _extract_charts(state["run_key"], state),
            "output_files": _collect_output_files(state),
        }

        # Fill the review table rows once the Phase 2 labels are on disk.
        updated_state = _populate_review_df(updated_state)

        return reply, updated_state
|
| 1108 |
+
|
| 1109 |
+
|
| 1110 |
+
# ============================================================================
|
| 1111 |
+
# Module-level singleton — imported by app.py as `from agent import agent`
|
| 1112 |
+
# ============================================================================
|
| 1113 |
+
|
| 1114 |
+
# Shared wrapper instance. The class defines no instance attributes; all
# per-conversation state is passed into and returned from invoke().
agent = ThematicAnalysisAgent()
|
| 1115 |
+
|
| 1116 |
+
|
| 1117 |
+
# ============================================================================
|
| 1118 |
+
# CLI smoke-test (python agent.py)
|
| 1119 |
+
# ============================================================================
|
| 1120 |
+
|
| 1121 |
+
if __name__ == "__main__":
    # Smoke test: run a single turn from an empty state and dump the outcome.
    smoke_reply, smoke_state = agent.invoke(
        "Hello — I have just uploaded my Scopus CSV. Please start the analysis.",
        {},
    )

    # The review table rows are left out of the printed state.
    printable_state = {
        key: value
        for key, value in smoke_state.items()
        if key != "review_df"
    }

    print("=" * 60)
    print("AGENT REPLY:\n")
    print(smoke_reply)
    print("\nSTATE:")
    print(json.dumps(printable_state, indent=2, default=str))
|
app.py
ADDED
|
@@ -0,0 +1,1016 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BERTopic Thematic Analysis Agent — Production Gradio UI
|
| 3 |
+
========================================================
|
| 4 |
+
A dashboard-style Gradio interface for orchestrating BERTopic topic modelling
|
| 5 |
+
via an LLM-backed agent defined in agent.py.
|
| 6 |
+
|
| 7 |
+
Layout
|
| 8 |
+
------
|
| 9 |
+
- Top: Header + Phase progress bar
|
| 10 |
+
- Body: Vertical cards in sequence
|
| 11 |
+
1) Data Input
|
| 12 |
+
2) Agent Console
|
| 13 |
+
3) Results (Tabs: Review | Charts | Downloads)
|
| 14 |
+
|
| 15 |
+
Fixes applied (v2)
|
| 16 |
+
------------------
|
| 17 |
+
- BUG 3 : submit_review() now writes parsed review rows into
|
| 18 |
+
agent_state["review_df"] BEFORE calling the agent, so
|
| 19 |
+
_parse_review_df() in agent.py always receives a populated list.
|
| 20 |
+
- ISSUE 2 : PHASES list updated to 7 labels matching the actual B&C phases
|
| 21 |
+
(was 6 labels misaligned with agent phase 0-6 mapping).
|
| 22 |
+
- ISSUE 4 : Added a startup API-key warning banner rendered in the UI when
|
| 23 |
+
MISTRAL_API_KEY is not set in the environment.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
# Imports
|
| 28 |
+
# ---------------------------------------------------------------------------
|
| 29 |
+
import gradio as gr
|
| 30 |
+
import pandas as pd
|
| 31 |
+
import json
|
| 32 |
+
import os
|
| 33 |
+
import shutil
|
| 34 |
+
import uuid
|
| 35 |
+
from pathlib import Path
|
| 36 |
+
from urllib.parse import quote
|
| 37 |
+
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
# Agent import — graceful stub when agent.py is absent during dev/testing
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
try:
    from agent import agent
except ImportError:
    AGENT_AVAILABLE = False

    class _StubAgent:
        """Stand-in used when ``agent.py`` cannot be imported, so the UI still runs."""

        def invoke(self, message: str, state: dict) -> tuple[str, dict]:
            """Echo *message* and advance ``state["phase"]`` by one, capped at 8.

            The reply reports the phase as it was *before* the increment.
            ``state`` is mutated in place and returned alongside the reply.
            """
            phase_before = state.get("phase", 0)
            reply = "".join(
                [
                    f"[STUB] Received: **{message}**\n\n",
                    "Connect `agent.py` to get real responses. ",
                    f"Current phase: `{phase_before}`.",
                ]
            )
            state["phase"] = min(phase_before + 1, 8)
            return reply, state

    agent = _StubAgent()
else:
    # Only reached when the real agent imported cleanly.
    AGENT_AVAILABLE = True
|
| 60 |
+
|
| 61 |
+
# ---------------------------------------------------------------------------
|
| 62 |
+
# Constants
|
| 63 |
+
# ---------------------------------------------------------------------------
|
| 64 |
+
|
| 65 |
+
# FIX ISSUE 2 — seven progress-bar labels; PHASES[i] is rendered as step i + 1.
# The trailing comments name the analysis phase each label stands for.
PHASES = [
    "Familiarisation",  # Phase 1
    "Initial Codes",    # Phase 2
    "Themes",           # Phase 3
    "Review Themes",    # Phase 4
    "Naming",           # Phase 5
    "PAJAIS Mapping",   # Phase 5.5
    "Report",           # Phase 6
]

# Choices offered for the chart view.
CHART_OPTIONS = [
    "Intertopic Map",
    "Top Words",
    "Hierarchy",
    "Heatmap",
]

# Column headers of the topic review table, in display order.
REVIEW_COLUMNS = [
    "#",
    "Topic Label",
    "Top Evidence",
    "Sentences",
    "Papers",
    "Approve",
    "Rename To",
    "Reasoning",
]

# Header-only frame with the review columns and no rows.
EMPTY_REVIEW_DF = pd.DataFrame(columns=REVIEW_COLUMNS)

# FIX ISSUE 4 — detect missing API key at startup (unset and empty both count).
API_KEY_MISSING = not os.environ.get("MISTRAL_API_KEY")

# Uploads live under the current working directory; outputs sit next to this file.
UPLOADS_DIR = Path("uploads")
OUTPUTS_DIR = Path(__file__).resolve().with_name("outputs")
|
| 89 |
+
|
| 90 |
+
# ---------------------------------------------------------------------------
|
| 91 |
+
# Custom CSS — SaaS dashboard aesthetic
|
| 92 |
+
# ---------------------------------------------------------------------------
|
| 93 |
+
CUSTOM_CSS = """
|
| 94 |
+
/* Fonts */
|
| 95 |
+
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,400;0,9..40,500;0,9..40,600;0,9..40,700;1,9..40,400&family=DM+Mono:wght@400;500&display=swap');
|
| 96 |
+
|
| 97 |
+
/* Tokens */
|
| 98 |
+
:root {
|
| 99 |
+
--bg-base: #0f1117;
|
| 100 |
+
--bg-surface: #181c27;
|
| 101 |
+
--bg-elevated: #1f2437;
|
| 102 |
+
--bg-hover: #252b3d;
|
| 103 |
+
--border: #2a3048;
|
| 104 |
+
--border-active: #4f6ef7;
|
| 105 |
+
--text-primary: #e8eaf0;
|
| 106 |
+
--text-secondary: #8b92a8;
|
| 107 |
+
--text-muted: #555f7a;
|
| 108 |
+
--accent: #4f6ef7;
|
| 109 |
+
--accent-soft: rgba(79,110,247,0.15);
|
| 110 |
+
--accent-glow: rgba(79,110,247,0.35);
|
| 111 |
+
--success: #34d399;
|
| 112 |
+
--success-soft: rgba(52,211,153,0.15);
|
| 113 |
+
--warning: #fbbf24;
|
| 114 |
+
--warning-soft: rgba(251,191,36,0.15);
|
| 115 |
+
--danger: #f87171;
|
| 116 |
+
--radius-sm: 8px;
|
| 117 |
+
--radius-md: 14px;
|
| 118 |
+
--radius-lg: 20px;
|
| 119 |
+
--shadow-card: 0 4px 24px rgba(0,0,0,0.45), 0 1px 3px rgba(0,0,0,0.3);
|
| 120 |
+
--shadow-button: 0 2px 12px rgba(79,110,247,0.4);
|
| 121 |
+
--font-ui: 'DM Sans', system-ui, sans-serif;
|
| 122 |
+
--font-mono: 'DM Mono', 'Fira Code', monospace;
|
| 123 |
+
--transition: 0.2s cubic-bezier(0.4, 0, 0.2, 1);
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
body, .gradio-container {
|
| 127 |
+
background: var(--bg-base) !important;
|
| 128 |
+
color: var(--text-primary) !important;
|
| 129 |
+
font-family: var(--font-ui) !important;
|
| 130 |
+
}
|
| 131 |
+
.gradio-container { max-width: 1600px !important; padding: 0 !important; }
|
| 132 |
+
|
| 133 |
+
/* Header */
|
| 134 |
+
#app-header {
|
| 135 |
+
background: linear-gradient(135deg, #0f1117 0%, #181c27 50%, #1a1f32 100%);
|
| 136 |
+
border-bottom: 1px solid var(--border);
|
| 137 |
+
padding: 24px 36px 20px;
|
| 138 |
+
position: relative;
|
| 139 |
+
overflow: hidden;
|
| 140 |
+
}
|
| 141 |
+
#app-header::before {
|
| 142 |
+
content: '';
|
| 143 |
+
position: absolute;
|
| 144 |
+
top: -60px; right: -60px;
|
| 145 |
+
width: 240px; height: 240px;
|
| 146 |
+
background: radial-gradient(circle, rgba(79,110,247,0.18) 0%, transparent 70%);
|
| 147 |
+
pointer-events: none;
|
| 148 |
+
}
|
| 149 |
+
#app-header .header-title {
|
| 150 |
+
font-size: 1.7rem; font-weight: 700; letter-spacing: -0.03em;
|
| 151 |
+
color: var(--text-primary); margin: 0 0 4px;
|
| 152 |
+
}
|
| 153 |
+
#app-header .header-subtitle {
|
| 154 |
+
font-size: 0.875rem; color: var(--text-secondary); margin: 0;
|
| 155 |
+
}
|
| 156 |
+
#app-header .header-badge {
|
| 157 |
+
display: inline-flex; align-items: center; gap: 6px;
|
| 158 |
+
background: var(--accent-soft); border: 1px solid var(--accent);
|
| 159 |
+
border-radius: 100px; padding: 3px 12px; font-size: 0.75rem;
|
| 160 |
+
font-weight: 600; color: var(--accent); margin-left: 12px; vertical-align: middle;
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
/* API key warning banner */
|
| 164 |
+
.api-warning {
|
| 165 |
+
background: var(--warning-soft);
|
| 166 |
+
border: 1px solid var(--warning);
|
| 167 |
+
border-radius: var(--radius-sm);
|
| 168 |
+
padding: 10px 16px;
|
| 169 |
+
font-size: 0.83rem;
|
| 170 |
+
font-weight: 500;
|
| 171 |
+
color: var(--warning);
|
| 172 |
+
margin: 12px 28px 0;
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
/* Phase progress bar */
|
| 176 |
+
.phase-bar-wrap {
|
| 177 |
+
display: flex; align-items: center; gap: 0;
|
| 178 |
+
margin-top: 20px; position: relative;
|
| 179 |
+
}
|
| 180 |
+
.phase-bar-wrap::before {
|
| 181 |
+
content: '';
|
| 182 |
+
position: absolute;
|
| 183 |
+
left: 20px; right: 20px; top: 50%;
|
| 184 |
+
height: 2px; background: var(--border);
|
| 185 |
+
transform: translateY(-50%); z-index: 0;
|
| 186 |
+
}
|
| 187 |
+
.phase-item {
|
| 188 |
+
display: flex; flex-direction: column;
|
| 189 |
+
align-items: center; flex: 1; position: relative; z-index: 1;
|
| 190 |
+
}
|
| 191 |
+
.phase-dot {
|
| 192 |
+
width: 32px; height: 32px; border-radius: 50%;
|
| 193 |
+
display: flex; align-items: center; justify-content: center;
|
| 194 |
+
font-size: 0.8rem; font-weight: 700;
|
| 195 |
+
border: 2px solid var(--border); background: var(--bg-base);
|
| 196 |
+
transition: all var(--transition);
|
| 197 |
+
}
|
| 198 |
+
.phase-dot.done { background: var(--success-soft); border-color: var(--success); color: var(--success); }
|
| 199 |
+
.phase-dot.active { background: var(--accent-soft); border-color: var(--accent); color: var(--accent);
|
| 200 |
+
box-shadow: 0 0 14px var(--accent-glow); }
|
| 201 |
+
.phase-dot.pending { color: var(--text-muted); }
|
| 202 |
+
.phase-label {
|
| 203 |
+
font-size: 0.65rem; font-weight: 500; color: var(--text-muted);
|
| 204 |
+
margin-top: 6px; text-align: center; letter-spacing: 0.02em; white-space: nowrap;
|
| 205 |
+
}
|
| 206 |
+
.phase-label.active { color: var(--accent); }
|
| 207 |
+
.phase-label.done { color: var(--success); }
|
| 208 |
+
|
| 209 |
+
/* Main body */
|
| 210 |
+
#main-body {
|
| 211 |
+
padding: 22px 28px 32px;
|
| 212 |
+
gap: 16px !important;
|
| 213 |
+
max-width: 1160px;
|
| 214 |
+
margin: 0 auto;
|
| 215 |
+
width: 100%;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
.panel-card {
|
| 219 |
+
background:
|
| 220 |
+
radial-gradient(1200px 260px at 100% -15%, rgba(79,110,247,0.12), transparent 52%),
|
| 221 |
+
linear-gradient(180deg, rgba(31,36,55,0.9) 0%, rgba(24,28,39,0.95) 100%);
|
| 222 |
+
border: 1px solid var(--border);
|
| 223 |
+
border-radius: var(--radius-lg);
|
| 224 |
+
box-shadow: var(--shadow-card);
|
| 225 |
+
padding: 18px 18px 16px;
|
| 226 |
+
position: relative;
|
| 227 |
+
overflow: hidden;
|
| 228 |
+
margin-bottom: 2px;
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
.panel-card:last-child { margin-bottom: 0; }
|
| 232 |
+
|
| 233 |
+
.panel-card::after {
|
| 234 |
+
content: '';
|
| 235 |
+
position: absolute;
|
| 236 |
+
inset: 0;
|
| 237 |
+
background: linear-gradient(120deg, rgba(255,255,255,0.02), transparent 25%, transparent 75%, rgba(255,255,255,0.02));
|
| 238 |
+
pointer-events: none;
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
.panel-data { margin-bottom: 2px; }
|
| 242 |
+
.panel-chat { margin-bottom: 2px; }
|
| 243 |
+
|
| 244 |
+
/* Card titles */
|
| 245 |
+
.card-title {
|
| 246 |
+
font-size: 0.74rem; font-weight: 700; letter-spacing: 0.1em;
|
| 247 |
+
text-transform: uppercase; color: var(--text-muted);
|
| 248 |
+
margin: 0 0 16px; display: flex; align-items: center; gap: 10px;
|
| 249 |
+
border-bottom: 1px solid var(--border);
|
| 250 |
+
padding-bottom: 12px;
|
| 251 |
+
}
|
| 252 |
+
.card-title::before {
|
| 253 |
+
content: '';
|
| 254 |
+
width: 8px;
|
| 255 |
+
height: 8px;
|
| 256 |
+
border-radius: 50%;
|
| 257 |
+
background: var(--accent);
|
| 258 |
+
box-shadow: 0 0 10px var(--accent-glow);
|
| 259 |
+
}
|
| 260 |
+
.card-title span { font-size: 1.02rem; color: var(--text-primary); letter-spacing: 0.01em; }
|
| 261 |
+
|
| 262 |
+
/* Stats */
|
| 263 |
+
.stats-grid {
|
| 264 |
+
display: grid; grid-template-columns: 1fr 1fr; gap: 10px; margin-top: 12px;
|
| 265 |
+
}
|
| 266 |
+
.stat-card {
|
| 267 |
+
background: var(--bg-elevated); border: 1px solid var(--border);
|
| 268 |
+
border-radius: var(--radius-sm); padding: 12px 14px;
|
| 269 |
+
}
|
| 270 |
+
.stat-value { font-size: 1.4rem; font-weight: 700; color: var(--text-primary); line-height: 1; }
|
| 271 |
+
.stat-label { font-size: 0.72rem; color: var(--text-muted); margin-top: 4px; text-transform: uppercase; letter-spacing: 0.05em; }
|
| 272 |
+
.stat-card.accent .stat-value { color: var(--accent); }
|
| 273 |
+
.stat-card.success .stat-value { color: var(--success); }
|
| 274 |
+
|
| 275 |
+
/* Status pill */
|
| 276 |
+
.status-pill {
|
| 277 |
+
display: inline-flex; align-items: center; gap: 6px;
|
| 278 |
+
padding: 5px 12px; border-radius: 100px; font-size: 0.78rem; font-weight: 600; margin-top: 12px;
|
| 279 |
+
}
|
| 280 |
+
.status-pill.idle { background: rgba(139,146,168,0.12); color: var(--text-secondary); }
|
| 281 |
+
.status-pill.ready { background: var(--success-soft); color: var(--success); }
|
| 282 |
+
.status-pill.working { background: var(--accent-soft); color: var(--accent); }
|
| 283 |
+
.status-pill .dot { width: 7px; height: 7px; border-radius: 50%; background: currentColor; }
|
| 284 |
+
.status-pill.working .dot { animation: pulse-dot 1.2s ease-in-out infinite; }
|
| 285 |
+
@keyframes pulse-dot {
|
| 286 |
+
0%, 100% { opacity: 1; transform: scale(1); }
|
| 287 |
+
50% { opacity: 0.4; transform: scale(0.7); }
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
/* Chatbot */
|
| 291 |
+
#chatbot-container .chatbot {
|
| 292 |
+
background: var(--bg-elevated) !important;
|
| 293 |
+
border: 1px solid var(--border) !important;
|
| 294 |
+
border-radius: var(--radius-md) !important;
|
| 295 |
+
}
|
| 296 |
+
.message.user {
|
| 297 |
+
background: var(--accent-soft) !important;
|
| 298 |
+
border: 1px solid rgba(79,110,247,0.2) !important;
|
| 299 |
+
border-radius: 14px 14px 4px 14px !important;
|
| 300 |
+
color: var(--text-primary) !important;
|
| 301 |
+
font-size: 0.875rem !important;
|
| 302 |
+
}
|
| 303 |
+
.message.bot {
|
| 304 |
+
background: var(--bg-elevated) !important;
|
| 305 |
+
border: 1px solid var(--border) !important;
|
| 306 |
+
border-radius: 14px 14px 14px 4px !important;
|
| 307 |
+
color: var(--text-primary) !important;
|
| 308 |
+
font-size: 0.875rem !important;
|
| 309 |
+
}
|
| 310 |
+
|
| 311 |
+
/* Chat input */
|
| 312 |
+
#chat-input-row { display: flex; gap: 10px; margin-top: 12px; align-items: flex-end; }
|
| 313 |
+
#chat-input-row textarea {
|
| 314 |
+
background: var(--bg-elevated) !important; border: 1px solid var(--border) !important;
|
| 315 |
+
border-radius: var(--radius-md) !important; color: var(--text-primary) !important;
|
| 316 |
+
font-family: var(--font-ui) !important; font-size: 0.875rem !important;
|
| 317 |
+
resize: none !important; transition: border-color var(--transition) !important;
|
| 318 |
+
}
|
| 319 |
+
#chat-input-row textarea:focus {
|
| 320 |
+
border-color: var(--accent) !important;
|
| 321 |
+
box-shadow: 0 0 0 3px var(--accent-soft) !important;
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
/* Buttons */
|
| 325 |
+
.btn-primary {
|
| 326 |
+
background: var(--accent) !important; border: none !important;
|
| 327 |
+
border-radius: var(--radius-sm) !important; color: #fff !important;
|
| 328 |
+
font-family: var(--font-ui) !important; font-weight: 600 !important;
|
| 329 |
+
font-size: 0.875rem !important; padding: 10px 20px !important;
|
| 330 |
+
cursor: pointer !important; box-shadow: var(--shadow-button) !important;
|
| 331 |
+
transition: all var(--transition) !important; white-space: nowrap;
|
| 332 |
+
}
|
| 333 |
+
.btn-primary:hover {
|
| 334 |
+
background: #3d5de6 !important;
|
| 335 |
+
box-shadow: 0 4px 20px rgba(79,110,247,0.55) !important;
|
| 336 |
+
transform: translateY(-1px) !important;
|
| 337 |
+
}
|
| 338 |
+
.btn-primary:disabled { opacity: 0.45 !important; cursor: not-allowed !important; transform: none !important; }
|
| 339 |
+
|
| 340 |
+
.btn-secondary {
|
| 341 |
+
background: var(--bg-elevated) !important; border: 1px solid var(--border) !important;
|
| 342 |
+
border-radius: var(--radius-sm) !important; color: var(--text-secondary) !important;
|
| 343 |
+
font-family: var(--font-ui) !important; font-weight: 500 !important;
|
| 344 |
+
font-size: 0.875rem !important; padding: 10px 18px !important;
|
| 345 |
+
cursor: pointer !important; transition: all var(--transition) !important;
|
| 346 |
+
}
|
| 347 |
+
.btn-secondary:hover {
|
| 348 |
+
background: var(--bg-hover) !important; border-color: var(--accent) !important;
|
| 349 |
+
color: var(--text-primary) !important;
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
.btn-success {
|
| 353 |
+
background: rgba(52,211,153,0.15) !important; border: 1px solid var(--success) !important;
|
| 354 |
+
border-radius: var(--radius-sm) !important; color: var(--success) !important;
|
| 355 |
+
font-family: var(--font-ui) !important; font-weight: 600 !important;
|
| 356 |
+
font-size: 0.875rem !important; padding: 10px 20px !important;
|
| 357 |
+
cursor: pointer !important; transition: all var(--transition) !important;
|
| 358 |
+
}
|
| 359 |
+
.btn-success:hover { background: rgba(52,211,153,0.25) !important; box-shadow: 0 2px 14px rgba(52,211,153,0.3) !important; }
|
| 360 |
+
|
| 361 |
+
/* Tabs */
|
| 362 |
+
.tabs > .tab-nav {
|
| 363 |
+
background: var(--bg-elevated) !important; border-bottom: 1px solid var(--border) !important;
|
| 364 |
+
border-radius: var(--radius-md) var(--radius-md) 0 0 !important;
|
| 365 |
+
padding: 6px 6px 0 !important; gap: 4px !important;
|
| 366 |
+
}
|
| 367 |
+
.tabs > .tab-nav button {
|
| 368 |
+
background: transparent !important; border: none !important;
|
| 369 |
+
color: var(--text-muted) !important; font-family: var(--font-ui) !important;
|
| 370 |
+
font-size: 0.8rem !important; font-weight: 600 !important;
|
| 371 |
+
letter-spacing: 0.04em !important; padding: 8px 16px !important;
|
| 372 |
+
border-radius: var(--radius-sm) var(--radius-sm) 0 0 !important;
|
| 373 |
+
transition: all var(--transition) !important; cursor: pointer !important;
|
| 374 |
+
}
|
| 375 |
+
.tabs > .tab-nav button:hover { color: var(--text-primary) !important; background: var(--bg-hover) !important; }
|
| 376 |
+
.tabs > .tab-nav button.selected {
|
| 377 |
+
color: var(--accent) !important; background: var(--accent-soft) !important;
|
| 378 |
+
box-shadow: inset 0 -2px 0 var(--accent) !important;
|
| 379 |
+
}
|
| 380 |
+
.tabitem {
|
| 381 |
+
background: var(--bg-elevated) !important; border: 1px solid var(--border) !important;
|
| 382 |
+
border-top: none !important; border-radius: 0 0 var(--radius-md) var(--radius-md) !important;
|
| 383 |
+
padding: 16px !important;
|
| 384 |
+
}
|
| 385 |
+
|
| 386 |
+
/* Dataframe */
|
| 387 |
+
.dataframe-wrap table {
|
| 388 |
+
font-family: var(--font-mono) !important;
|
| 389 |
+
font-size: 0.78rem !important;
|
| 390 |
+
border-collapse: collapse !important;
|
| 391 |
+
width: 100% !important;
|
| 392 |
+
table-layout: fixed !important;
|
| 393 |
+
}
|
| 394 |
+
.dataframe-wrap th {
|
| 395 |
+
background: var(--bg-elevated) !important; color: var(--text-muted) !important;
|
| 396 |
+
font-family: var(--font-ui) !important; font-size: 0.72rem !important;
|
| 397 |
+
font-weight: 600 !important; letter-spacing: 0.06em !important;
|
| 398 |
+
text-transform: uppercase !important; padding: 10px 12px !important;
|
| 399 |
+
border-bottom: 1px solid var(--border) !important;
|
| 400 |
+
}
|
| 401 |
+
.dataframe-wrap td {
|
| 402 |
+
background: var(--bg-surface) !important; color: var(--text-primary) !important;
|
| 403 |
+
padding: 9px 12px !important; border-bottom: 1px solid var(--border) !important;
|
| 404 |
+
line-height: 1.35 !important;
|
| 405 |
+
vertical-align: top !important;
|
| 406 |
+
}
|
| 407 |
+
.dataframe-wrap th,
|
| 408 |
+
.dataframe-wrap td {
|
| 409 |
+
white-space: nowrap !important;
|
| 410 |
+
}
|
| 411 |
+
.dataframe-wrap td > div,
|
| 412 |
+
.dataframe-wrap td > span,
|
| 413 |
+
.dataframe-wrap td > p {
|
| 414 |
+
display: block !important;
|
| 415 |
+
max-width: 100% !important;
|
| 416 |
+
white-space: nowrap !important;
|
| 417 |
+
overflow: hidden !important;
|
| 418 |
+
text-overflow: ellipsis !important;
|
| 419 |
+
cursor: pointer !important;
|
| 420 |
+
}
|
| 421 |
+
.dataframe-wrap td:focus-within > div,
|
| 422 |
+
.dataframe-wrap td:focus-within > span,
|
| 423 |
+
.dataframe-wrap td:focus-within > p {
|
| 424 |
+
white-space: normal !important;
|
| 425 |
+
overflow-wrap: anywhere !important;
|
| 426 |
+
word-break: break-word !important;
|
| 427 |
+
max-height: 9em !important;
|
| 428 |
+
overflow-y: auto !important;
|
| 429 |
+
padding-right: 2px !important;
|
| 430 |
+
}
|
| 431 |
+
.dataframe-wrap textarea,
|
| 432 |
+
.dataframe-wrap input[type="text"] {
|
| 433 |
+
white-space: pre-wrap !important;
|
| 434 |
+
overflow-wrap: anywhere !important;
|
| 435 |
+
word-break: break-word !important;
|
| 436 |
+
}
|
| 437 |
+
.dataframe-wrap textarea {
|
| 438 |
+
min-height: 38px !important;
|
| 439 |
+
height: 38px !important;
|
| 440 |
+
max-height: 160px !important;
|
| 441 |
+
overflow-y: auto !important;
|
| 442 |
+
resize: vertical !important;
|
| 443 |
+
}
|
| 444 |
+
.dataframe-wrap tr:hover td { background: var(--bg-hover) !important; }
|
| 445 |
+
.dataframe-wrap input[type="checkbox"] {
|
| 446 |
+
appearance: auto !important;
|
| 447 |
+
accent-color: var(--accent) !important;
|
| 448 |
+
cursor: pointer !important;
|
| 449 |
+
width: 16px;
|
| 450 |
+
height: 16px;
|
| 451 |
+
}
|
| 452 |
+
|
| 453 |
+
/* Chart frame */
|
| 454 |
+
.chart-frame {
|
| 455 |
+
width: 100%; min-height: 420px; border: 1px solid var(--border);
|
| 456 |
+
border-radius: var(--radius-md); background: var(--bg-elevated); overflow: hidden;
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
/* Vertical card spacing on small screens */
|
| 460 |
+
@media (max-width: 900px) {
|
| 461 |
+
#main-body {
|
| 462 |
+
padding: 14px 12px 20px;
|
| 463 |
+
gap: 12px !important;
|
| 464 |
+
}
|
| 465 |
+
.panel-card {
|
| 466 |
+
padding: 14px 12px;
|
| 467 |
+
border-radius: var(--radius-md);
|
| 468 |
+
}
|
| 469 |
+
.chart-frame { min-height: 320px; }
|
| 470 |
+
}
|
| 471 |
+
|
| 472 |
+
/* Download list */
|
| 473 |
+
.file-list-item {
|
| 474 |
+
display: flex; align-items: center; gap: 10px;
|
| 475 |
+
background: var(--bg-elevated); border: 1px solid var(--border);
|
| 476 |
+
border-radius: var(--radius-sm); padding: 10px 14px; margin-bottom: 8px;
|
| 477 |
+
transition: all var(--transition);
|
| 478 |
+
}
|
| 479 |
+
.file-list-item:hover { border-color: var(--accent); background: var(--bg-hover); }
|
| 480 |
+
.file-icon { font-size: 1.1rem; }
|
| 481 |
+
.file-name { font-size: 0.83rem; color: var(--text-primary); flex: 1; font-family: var(--font-mono); }
|
| 482 |
+
.file-size { font-size: 0.72rem; color: var(--text-muted); }
|
| 483 |
+
|
| 484 |
+
/* Misc Gradio overrides */
|
| 485 |
+
label, .label-wrap { color: var(--text-secondary) !important; font-family: var(--font-ui) !important; font-size: 0.8rem !important; }
|
| 486 |
+
input:not([type="checkbox"]), textarea { background: var(--bg-elevated) !important; color: var(--text-primary) !important; border-color: var(--border) !important; }
|
| 487 |
+
.gr-form:not(.panel-card), .gr-box:not(.panel-card) { background: transparent !important; border: none !important; }
|
| 488 |
+
footer { display: none !important; }
|
| 489 |
+
select { background: var(--bg-elevated) !important; border: 1px solid var(--border) !important; border-radius: var(--radius-sm) !important; color: var(--text-primary) !important; font-family: var(--font-ui) !important; font-size: 0.875rem !important; padding: 8px 12px !important; }
|
| 490 |
+
|
| 491 |
+
/* Animations */
|
| 492 |
+
.fade-in { animation: fadeIn 0.35s ease-out both; }
|
| 493 |
+
@keyframes fadeIn { from { opacity: 0; transform: translateY(8px); } to { opacity: 1; transform: none; } }
|
| 494 |
+
|
| 495 |
+
/* Scrollbar */
|
| 496 |
+
::-webkit-scrollbar { width: 6px; height: 6px; }
|
| 497 |
+
::-webkit-scrollbar-track { background: var(--bg-base); }
|
| 498 |
+
::-webkit-scrollbar-thumb { background: #2d3550; border-radius: 3px; }
|
| 499 |
+
::-webkit-scrollbar-thumb:hover { background: #3d4770; }
|
| 500 |
+
"""
|
| 501 |
+
|
| 502 |
+
# ---------------------------------------------------------------------------
|
| 503 |
+
# Helper — build phase-progress HTML
|
| 504 |
+
# FIX ISSUE 2 — phase index maps correctly to 7-item PHASES list
|
| 505 |
+
# ---------------------------------------------------------------------------
|
| 506 |
+
def build_phase_html(current_phase: int) -> str:
    """
    Build the page header markup, including the phase progress bar.

    One step is emitted per entry of PHASES, numbered from 1.  A step whose
    number is below ``current_phase`` is drawn as done, the step equal to it
    as active, and every other step as pending.  Hence ``0`` leaves every
    step pending and ``8`` marks all seven as done.
    """

    def _render_step(number: int, label: str) -> str:
        # Same test order as the step states: done, then active, then pending.
        if number < current_phase:
            dot_cls, lbl_cls, icon = "done", "done", "v"
        elif number == current_phase:
            dot_cls, lbl_cls, icon = "active", "active", str(number)
        else:
            dot_cls, lbl_cls, icon = "pending", "", str(number)
        return (
            "\n"
            '<div class="phase-item">\n'
            f'<div class="phase-dot {dot_cls}">{icon}</div>\n'
            f'<div class="phase-label {lbl_cls}">{label}</div>\n'
            "</div>"
        )

    inner = "\n".join(
        _render_step(number, label) for number, label in enumerate(PHASES, start=1)
    )

    header_parts = [
        "",
        '<div id="app-header">',
        '<div style="display:flex;align-items:baseline;gap:4px;">',
        '<span class="header-title">BERTopic Thematic Analysis Agent</span>',
        '<span class="header-badge">AI-Powered</span>',
        "</div>",
        '<p class="header-subtitle">',
        "End-to-end topic modelling — upload a Scopus corpus, run the agent, review topics.",
        "</p>",
        '<div class="phase-bar-wrap">',
        inner,
        "</div>",
        "</div>",
    ]
    return "\n".join(header_parts)
|
| 542 |
+
|
| 543 |
+
|
| 544 |
+
# ---------------------------------------------------------------------------
|
| 545 |
+
# Helper — dataset stats HTML
|
| 546 |
+
# ---------------------------------------------------------------------------
|
| 547 |
+
def build_stats_html(rows: int, cols: int, filename: str) -> str:
    """
    Render the dataset summary shown after a successful upload.

    Args:
        rows: Number of data rows; rendered with thousands separators.
        cols: Number of columns.
        filename: Display name of the uploaded file.  It originates from the
            user's upload, so it is HTML-escaped before being embedded.

    Returns:
        An HTML fragment with two stat cards and a "ready" pill carrying the
        file name.
    """
    # Local import keeps this block self-contained.
    from html import escape

    # A crafted file name must never be interpreted as markup.
    safe_name = escape(str(filename))
    return f"""
<div class="stats-grid fade-in">
  <div class="stat-card accent">
    <div class="stat-value">{rows:,}</div>
    <div class="stat-label">Rows</div>
  </div>
  <div class="stat-card">
    <div class="stat-value">{cols}</div>
    <div class="stat-label">Columns</div>
  </div>
</div>
<div class="status-pill ready" style="margin-top:14px;">
  <div class="dot"></div>
  {safe_name}
</div>"""
|
| 563 |
+
|
| 564 |
+
|
| 565 |
+
# ---------------------------------------------------------------------------
|
| 566 |
+
# Helper — download file-list HTML
|
| 567 |
+
# ---------------------------------------------------------------------------
|
| 568 |
+
def build_file_list_html(paths: list[str]) -> str:
    """
    Render the list of downloadable output files.

    Args:
        paths: File paths to list.  Paths that cannot be stat'ed (missing,
            unreadable, malformed) are still listed, with an empty size.

    Returns:
        One ``file-list-item`` row per path joined by newlines, or a
        "No files generated yet." paragraph when ``paths`` is empty.
    """
    # Local import keeps this block self-contained.
    from html import escape

    if not paths:
        return "<p style='color:var(--text-muted);font-size:0.83rem;padding:8px 0;'>No files generated yet.</p>"

    # Short text badge shown in place of an icon, keyed by lower-cased suffix.
    badges = {".csv": "CSV", ".json": "JSON", ".html": "HTML", ".png": "IMG", ".xlsx": "XLS", ".txt": "TXT"}

    rows = []
    for raw_path in paths:
        path = Path(raw_path)
        badge = badges.get(path.suffix.lower(), "FILE")

        # A single stat() avoids the exists()/stat() race and tolerates
        # files that vanish or are unreadable.
        try:
            n_bytes = path.stat().st_size
        except (OSError, ValueError):
            size = ""
        else:
            size = f"{n_bytes/1024:.1f} KB" if n_bytes < 1_048_576 else f"{n_bytes/1_048_576:.1f} MB"

        # File names are not trusted markup; escape before embedding.
        safe_name = escape(path.name)
        rows.append(f"""
<div class="file-list-item fade-in">
  <span class="file-icon" style="font-size:0.7rem;background:var(--accent-soft);color:var(--accent);padding:2px 5px;border-radius:4px;font-family:var(--font-mono);font-weight:600;">{badge}</span>
  <span class="file-name">{safe_name}</span>
  <span class="file-size">{size}</span>
</div>""")
    return "\n".join(rows)
|
| 589 |
+
|
| 590 |
+
|
| 591 |
+
# ---------------------------------------------------------------------------
|
| 592 |
+
# Helper — placeholder chart HTML
|
| 593 |
+
# ---------------------------------------------------------------------------
|
| 594 |
+
def build_placeholder_chart(chart_type: str) -> str:
    """
    Return the "no chart yet" placeholder markup for ``chart_type``.

    The animated bar takes a colour specific to the chart type; unknown
    types fall back to the default accent blue.
    """
    fallback_colour = "#4f6ef7"
    palette = {
        "Intertopic Map": "#4f6ef7",
        "Top Words": "#34d399",
        "Hierarchy": "#fbbf24",
        "Heatmap": "#f87171",
    }
    bar_colour = palette[chart_type] if chart_type in palette else fallback_colour

    markup = [
        "",
        '<div class="chart-frame" style="display:flex;align-items:center;justify-content:center;flex-direction:column;gap:10px;">',
        '<div style="font-size:2rem;color:var(--text-muted);">CHART</div>',
        f'<div style="color:var(--text-secondary);font-size:0.9rem;font-weight:600;">{chart_type}</div>',
        '<div style="color:var(--text-muted);font-size:0.78rem;">Run the agent to generate this chart.</div>',
        '<div style="width:180px;height:4px;background:var(--border);border-radius:2px;margin-top:6px;">',
        f'<div style="width:0%;height:4px;background:{bar_colour};border-radius:2px;animation:grow 2s ease-in-out infinite alternate;"></div>',
        "</div>",
        "</div>",
        "<style>@keyframes grow { from{width:0%} to{width:75%} }</style>",
    ]
    return "\n".join(markup)
|
| 612 |
+
|
| 613 |
+
|
| 614 |
+
# ---------------------------------------------------------------------------
|
| 615 |
+
# Core interaction handlers
|
| 616 |
+
# ---------------------------------------------------------------------------
|
| 617 |
+
|
| 618 |
+
def _persist_upload(file_obj) -> Path:
    """
    Copy an uploaded temp file into UPLOADS_DIR and return the copy's path.

    ``file_obj.name`` is taken as the path of the temporary upload.  The copy
    is prefixed with ten random hex characters so repeated uploads of the
    same file name do not overwrite each other.  The returned path is
    resolved to an absolute one.
    """
    source = Path(file_obj.name)
    UPLOADS_DIR.mkdir(parents=True, exist_ok=True)

    unique_prefix = uuid.uuid4().hex[:10]
    target = UPLOADS_DIR / f"{unique_prefix}_{source.name}"

    shutil.copy2(source, target)
    return target.resolve()
|
| 625 |
+
|
| 626 |
+
def handle_file_upload(file_obj, agent_state):
    """
    Persist and parse an uploaded CSV, recording its details in ``agent_state``.

    On success the keys ``file_path``, ``file_name``, ``rows`` and ``cols``
    are written to ``agent_state``.  On failure the state is left untouched,
    the error is shown in the stats area, and the status pill reports the
    failure rather than claiming the file is ready.

    Args:
        file_obj: Upload object exposing the temp file path as ``.name``,
            or ``None`` when nothing is selected.
        agent_state: Mutable per-session state dict.

    Returns:
        A ``(stats_html, status_html, agent_state, phase_html)`` tuple.
    """
    # Local import keeps this block self-contained.
    from html import escape

    if file_obj is None:
        return (
            "<p style='color:var(--text-muted);font-size:0.83rem;'>No file selected.</p>",
            "<div class='status-pill idle'><div class='dot'></div>Awaiting upload</div>",
            agent_state,
            build_phase_html(agent_state.get("phase", 0)),
        )

    try:
        persisted = _persist_upload(file_obj)
        df = pd.read_csv(persisted)
        rows, cols = df.shape
        filename = Path(file_obj.name).name
        stats_html = build_stats_html(rows, cols, filename)
        # State is written only after everything above succeeded, so a
        # failed upload never leaves half-updated details behind.
        agent_state["file_path"] = str(persisted)
        agent_state["file_name"] = filename
        agent_state["rows"] = rows
        agent_state["cols"] = cols
    except Exception as exc:  # UI boundary: report the problem instead of crashing
        # Exception text can echo file contents or names; escape it for HTML.
        stats_html = (
            "<p style='color:var(--danger);font-size:0.83rem;'>"
            f"Upload error: {escape(str(exc))}</p>"
        )
        status_html = "<div class='status-pill idle'><div class='dot'></div>Upload failed</div>"
    else:
        status_html = "<div class='status-pill ready'><div class='dot'></div>File ready</div>"

    phase_html = build_phase_html(agent_state.get("phase", 0))
    return stats_html, status_html, agent_state, phase_html
|
| 652 |
+
|
| 653 |
+
|
| 654 |
+
def handle_chat(user_message: str, chat_history: list, agent_state: dict):
    """Run one user turn through the agent as a generator of UI updates.

    Every yielded item is ``(chat_history, agent_state, phase_html)``.
    A blank message yields the inputs unchanged. Otherwise the user message
    and a "Thinking..." placeholder are appended and yielded first, then the
    placeholder is overwritten with the agent reply (or an error text) and
    yielded again.
    """
    def _frame(history, state):
        # One UI update: history, state, and the phase bar rendered from state.
        return history, state, build_phase_html(state.get("phase", 0))

    if not user_message.strip():
        yield _frame(chat_history, agent_state)
        return

    chat_history = [
        *chat_history,
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": "Thinking..."},
    ]
    yield _frame(chat_history, agent_state)

    # The placeholder dict is edited in place once the outcome is known.
    pending = chat_history[-1]

    stored_path = agent_state.get("file_path")
    if stored_path and not Path(stored_path).exists():
        pending["content"] = (
            "Uploaded CSV is no longer available on disk. "
            "Please upload the file again and retry."
        )
        yield _frame(chat_history, agent_state)
        return

    try:
        reply, agent_state = agent.invoke(user_message, agent_state)
    except Exception as exc:
        # Surface the failure in the chat instead of raising into the UI.
        reply = f"Agent error: `{exc}`"

    pending["content"] = reply
    yield _frame(chat_history, agent_state)
|
| 682 |
+
|
| 683 |
+
|
| 684 |
+
def auto_trigger_agent(agent_state: dict, chat_history: list):
    """Send the automatic kickoff message after an upload and return the last update.

    The message is built from ``file_name`` and ``rows`` in ``agent_state``
    and streamed through ``handle_chat``; only the final yielded
    ``(chat_history, agent_state, phase_html)`` tuple is returned.
    """
    dataset_name = agent_state.get("file_name", "uploaded file")
    row_count = agent_state.get("rows", 0)
    kickoff = (
        f"A dataset has been uploaded: **{dataset_name}** ({row_count:,} rows). "
        "Please start the thematic analysis pipeline."
    )

    # Drain the generator, keeping only its last item.
    final = []
    for final in handle_chat(kickoff, chat_history, agent_state):
        pass
    return final  # (chat_history, agent_state, phase_html)
|
| 696 |
+
|
| 697 |
+
|
| 698 |
+
def refresh_review_table(agent_state: dict):
    """Build the Gradio update for the review table from ``agent_state["review_df"]``.

    Falls back to a copy of ``EMPTY_REVIEW_DF`` when no review rows are
    stored or when they cannot be turned into a DataFrame.
    """
    def _blank():
        return gr.update(value=EMPTY_REVIEW_DF.copy(), interactive=True)

    records = agent_state.get("review_df", [])
    if not records:
        return _blank()

    try:
        return gr.update(value=pd.DataFrame(records), interactive=True)
    except Exception:
        # Best effort: malformed rows degrade to the empty table.
        return _blank()
|
| 707 |
+
|
| 708 |
+
|
| 709 |
+
def submit_review(review_df, agent_state: dict, chat_history: list):
    """Store the edited review table in state, then ask the agent to continue.

    The table rows are written to ``agent_state["review_df"]`` as a list of
    record dicts *before* the agent is invoked, because the trigger message
    itself carries no table data. Returns the final
    ``(chat_history, agent_state, phase_html)`` produced by ``handle_chat``.
    """
    agent_state["review_df"] = review_df.to_dict(orient="records")

    trigger = "Review table submitted. Please proceed to Phase 3 and consolidate themes."

    # Drain the chat generator, keeping only its last update.
    final = []
    for final in handle_chat(trigger, chat_history, agent_state):
        pass

    updated_history, updated_state, phase_html = final
    return updated_history, updated_state, phase_html
|
| 724 |
+
|
| 725 |
+
|
| 726 |
+
def refresh_downloads(agent_state: dict):
    """Return the file-list HTML and the downloadable paths from agent state.

    The HTML is rendered from every entry in ``agent_state["output_files"]``;
    the second value contains only the entries that exist on disk, or
    ``None`` when there are none.
    """
    produced = agent_state.get("output_files", [])
    listing_html = build_file_list_html(produced)
    on_disk = [path for path in produced if os.path.exists(path)]
    return listing_html, (on_disk or None)
|
| 732 |
+
|
| 733 |
+
|
| 734 |
+
def get_chart_html(chart_choice: str, agent_state: dict) -> str:
    """Return the HTML to show for the selected chart.

    Looks ``chart_choice`` up in ``agent_state["charts"]``:

    * not present: the placeholder from ``build_placeholder_chart``;
    * value is an existing file path: an ``<iframe>`` pointing at it;
    * otherwise: the value itself wrapped in a ``<div>``.
    """
    available = agent_state.get("charts", {})
    if chart_choice not in available:
        return build_placeholder_chart(chart_choice)

    src = available[chart_choice]
    if not os.path.exists(src):
        return f'<div class="chart-frame fade-in">{src}</div>'

    # Gradio 6 serves local files from /gradio_api/file=..., and the path
    # must be URL-encoded when directories contain spaces.
    absolute = str(Path(src).resolve())
    encoded = quote(absolute.replace("\\", "/"), safe="/:")
    iframe_open = f'<iframe src="./gradio_api/file={encoded}" '
    return iframe_open + 'class="chart-frame" frameborder="0"></iframe>'
|
| 750 |
+
|
| 751 |
+
|
| 752 |
+
# ---------------------------------------------------------------------------
|
| 753 |
+
# Build UI
|
| 754 |
+
# ---------------------------------------------------------------------------
|
| 755 |
+
|
| 756 |
+
def build_app() -> gr.Blocks:
    """Assemble the Gradio UI and wire its events; return the ``gr.Blocks`` app.

    Layout: a phase bar, an optional API-key warning, then three stacked
    panels (data input, agent console, results tabs for review / charts /
    downloads).

    The visible ``chatbot`` component is the single source of truth for the
    conversation: every handler that needs the history reads it from there.
    """
    with gr.Blocks(
        title="BERTopic Thematic Analysis Agent",
    ) as app:

        # ── Shared state ──────────────────────────────────────────────────
        # Only the agent state is kept in gr.State. A separate chat-history
        # State used to exist, but nothing ever wrote to it, so handlers
        # reading it always saw an empty list.
        agent_state = gr.State({})

        # ── Header ────────────────────────────────────────────────────────
        phase_bar = gr.HTML(value=build_phase_html(0), elem_id="phase-bar")

        # Show a warning banner when the API key is missing.
        if API_KEY_MISSING:
            gr.HTML(
                "<div class='api-warning'>"
                "WARNING: MISTRAL_API_KEY is not set. "
                "All LLM calls will fail. "
                "Set it in HuggingFace Spaces: Settings -> Variables and secrets."
                "</div>"
            )

        # ── Main vertical body ────────────────────────────────────────────
        with gr.Column(elem_id="main-body"):

            with gr.Column(elem_classes=["panel-card", "panel-data"]):
                gr.HTML("""<div class="card-title"><span>Data Input</span></div>""")

                file_input = gr.File(
                    label="Upload Corpus (CSV)",
                    file_types=[".csv"],
                    interactive=True,
                    elem_id="csv-upload",
                )

                file_status = gr.HTML(
                    value="<div class='status-pill idle'><div class='dot'></div>Awaiting upload</div>"
                )

                dataset_stats = gr.HTML(
                    value="<p style='color:var(--text-muted);font-size:0.83rem;"
                    "padding:8px 0 0;'>Upload a CSV to see statistics.</p>"
                )

                gr.HTML("<hr style='border:none;border-top:1px solid var(--border);margin:16px 0;'>")
                gr.HTML("""
                <div style='font-size:0.72rem;color:var(--text-muted);line-height:1.7;'>
                <b style='color:var(--text-secondary);'>Expected columns</b><br>
                Title, Abstract, Authors, Year<br><br>
                <b style='color:var(--text-secondary);'>Quick commands</b><br>
                <code style='font-family:var(--font-mono);'>run abstract</code><br>
                <code style='font-family:var(--font-mono);'>show topics</code><br>
                <code style='font-family:var(--font-mono);'>export results</code>
                </div>""")

            with gr.Column(elem_classes=["panel-card", "panel-chat"]):
                gr.HTML("""<div class="card-title"><span>Agent Console</span></div>""")

                chatbot = gr.Chatbot(
                    value=[],
                    height=470,
                    show_label=False,
                    avatar_images=(None, None),
                    elem_id="chatbot-container",
                )

                with gr.Row(elem_id="chat-input-row"):
                    chat_input = gr.Textbox(
                        placeholder='Type a command, e.g. "run abstract" ...',
                        show_label=False,
                        lines=1,
                        scale=5,
                        container=False,
                    )
                    send_btn = gr.Button(
                        "Send",
                        variant="primary",
                        scale=1,
                        min_width=90,
                        elem_classes=["btn-primary"],
                    )

                with gr.Row():
                    clear_btn = gr.Button(
                        "Clear Chat",
                        variant="secondary",
                        scale=1,
                        elem_classes=["btn-secondary"],
                    )

            with gr.Column(elem_classes=["panel-card", "panel-results"]):
                gr.HTML("""<div class="card-title"><span>Results</span></div>""")

                with gr.Tabs(elem_classes=["tabs"]):

                    # ── Tab 1: Review Table ─────────────────────────────
                    with gr.TabItem("Review", elem_classes=["tabitem"]):
                        gr.HTML("""
                        <p style='font-size:0.78rem;color:var(--text-muted);margin:0 0 12px;'>
                        Edit <b>Approve</b>, <b>Rename To</b>, and <b>Reasoning</b> columns inline,
                        then click <b>Submit Review</b>.
                        </p>""")

                        review_table = gr.Dataframe(
                            value=EMPTY_REVIEW_DF.copy(),
                            headers=REVIEW_COLUMNS,
                            datatype=[
                                "number", "str", "str", "number", "str",
                                "bool", "str", "str",
                            ],
                            interactive=True,
                            wrap=True,
                            elem_classes=["dataframe-wrap"],
                        )

                        with gr.Row():
                            refresh_table_btn = gr.Button(
                                "Refresh",
                                variant="secondary",
                                scale=1,
                                elem_classes=["btn-secondary"],
                            )
                            submit_review_btn = gr.Button(
                                "Submit Review",
                                variant="primary",
                                scale=2,
                                elem_classes=["btn-success"],
                            )

                    # ── Tab 2: Charts ───────────────────────────────────
                    with gr.TabItem("Charts", elem_classes=["tabitem"]):
                        chart_selector = gr.Dropdown(
                            choices=CHART_OPTIONS,
                            value=CHART_OPTIONS[0],
                            label="Select chart",
                            interactive=True,
                        )
                        chart_display = gr.HTML(
                            value=build_placeholder_chart(CHART_OPTIONS[0])
                        )

                    # ── Tab 3: Downloads ────────────────────────────────
                    with gr.TabItem("Downloads", elem_classes=["tabitem"]):
                        gr.HTML("""
                        <p style='font-size:0.78rem;color:var(--text-muted);margin:0 0 12px;'>
                        Files generated by the agent will appear here automatically.
                        </p>""")

                        download_file_list_html = gr.HTML(
                            value="<p style='color:var(--text-muted);font-size:0.83rem;'>"
                            "No files generated yet.</p>"
                        )

                        download_files = gr.File(
                            label="",
                            file_count="multiple",
                            interactive=False,
                        )

                        refresh_dl_btn = gr.Button(
                            "Refresh Downloads",
                            variant="secondary",
                            elem_classes=["btn-secondary"],
                        )

        # ────────────────────────────────────────────────────────────────
        # Event wiring
        # ────────────────────────────────────────────────────────────────

        def _on_file_upload(file_obj, a_state, c_history):
            """Record the upload and, if it succeeded, kick off the agent."""
            stats, status, a_state, phase_html = handle_file_upload(file_obj, a_state)
            if file_obj is not None and "file_path" in a_state:
                c_history, a_state, phase_html = auto_trigger_agent(a_state, c_history)
            return stats, status, a_state, phase_html, c_history

        # The history is read from the chatbot itself, so an upload (or
        # clearing the file) extends or keeps the conversation rather than
        # replacing it.
        file_input.change(
            fn=_on_file_upload,
            inputs=[file_input, agent_state, chatbot],
            outputs=[dataset_stats, file_status, agent_state, phase_bar, chatbot],
        )

        def _on_send(msg, c_history, a_state):
            """Stream a chat turn; the trailing "" clears the input box."""
            for history, state, phase_html in handle_chat(msg, c_history, a_state):
                yield history, state, phase_html, ""

        send_btn.click(
            fn=_on_send,
            inputs=[chat_input, chatbot, agent_state],
            outputs=[chatbot, agent_state, phase_bar, chat_input],
        )
        chat_input.submit(
            fn=_on_send,
            inputs=[chat_input, chatbot, agent_state],
            outputs=[chatbot, agent_state, phase_bar, chat_input],
        )

        # Clearing wipes the agent state, so the phase bar is reset with it
        # to stay consistent with the (now empty) state.
        clear_btn.click(
            fn=lambda: ([], {}, build_phase_html(0)),
            outputs=[chatbot, agent_state, phase_bar],
        )

        refresh_table_btn.click(
            fn=refresh_review_table,
            inputs=[agent_state],
            outputs=[review_table],
        )

        # submit_review writes the table into state before calling the agent.
        submit_review_btn.click(
            fn=submit_review,
            inputs=[review_table, agent_state, chatbot],
            outputs=[chatbot, agent_state, phase_bar],
        )

        chart_selector.change(
            fn=get_chart_html,
            inputs=[chart_selector, agent_state],
            outputs=[chart_display],
        )

        refresh_dl_btn.click(
            fn=refresh_downloads,
            inputs=[agent_state],
            outputs=[download_file_list_html, download_files],
        )

        # Auto-refresh review table, downloads, and the active chart after
        # every chat turn.
        chatbot.change(
            fn=lambda selected_chart, a: (
                refresh_review_table(a),
                *refresh_downloads(a),
                get_chart_html(selected_chart, a),
            ),
            inputs=[chart_selector, agent_state],
            outputs=[review_table, download_file_list_html, download_files, chart_display],
        )

    return app
|
| 996 |
+
|
| 997 |
+
|
| 998 |
+
# ---------------------------------------------------------------------------
|
| 999 |
+
# Entry point
|
| 1000 |
+
# ---------------------------------------------------------------------------
|
| 1001 |
+
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all interfaces.
    demo = build_app()

    ui_theme = gr.themes.Soft(
        primary_hue=gr.themes.colors.indigo,
        secondary_hue=gr.themes.colors.slate,
        neutral_hue=gr.themes.colors.slate,
        font=[gr.themes.GoogleFont("DM Sans"), "system-ui", "sans-serif"],
    )
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
        # Files under the outputs directory may be served to the browser.
        "allowed_paths": [str(OUTPUTS_DIR.resolve())],
        "css": CUSTOM_CSS,
        "theme": ui_theme,
    }
    demo.launch(**launch_options)
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
langchain-core
|
| 3 |
+
langchain-mistralai
|
| 4 |
+
langgraph
|
| 5 |
+
sentence-transformers
|
| 6 |
+
scikit-learn
|
| 7 |
+
bertopic
|
| 8 |
+
plotly
|
| 9 |
+
numpy
|
| 10 |
+
pandas
|
| 11 |
+
hdbscan
|
| 12 |
+
umap-learn
|
| 13 |
+
pynndescent
|
tools.py
ADDED
|
@@ -0,0 +1,858 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
tools.py — BERTopic Thematic Analysis Pipeline Tools
|
| 3 |
+
=====================================================
|
| 4 |
+
Seven LangChain @tool functions implementing Braun & Clarke's (2006)
|
| 5 |
+
six-phase thematic analysis pipeline.
|
| 6 |
+
|
| 7 |
+
Conventions
|
| 8 |
+
-----------
|
| 9 |
+
- All tools accept / return plain Python dicts (JSON-serialisable).
|
| 10 |
+
- Artefacts are written to OUTPUT_DIR / run_key / <file>.
|
| 11 |
+
- Functional style throughout: map, operator, numpy vectorised ops.
|
| 12 |
+
- Functional style is preferred: avoid for/while loops, try/except and if/else where practical (the LLM retry helper is a deliberate exception).
|
| 13 |
+
|
| 14 |
+
Fixes applied (v2)
|
| 15 |
+
------------------
|
| 16 |
+
- BUG 1 : run_bertopic_discovery() now saves sent_labels.npy —
|
| 17 |
+
per-sentence cluster-label array required by Tool 4.
|
| 18 |
+
- BUG 1 : consolidate_into_themes() _build_theme() rewritten —
|
| 19 |
+
centroid computed from actual merged-cluster embeddings
|
| 20 |
+
via sent_labels.npy mask (no dead `if False` scaffolding).
|
| 21 |
+
- ISSUE 1: generate_comparison_csv() guards against missing title run
|
| 22 |
+
with a .exists() check instead of hard-crashing.
|
| 23 |
+
|
| 24 |
+
Dependencies
|
| 25 |
+
------------
|
| 26 |
+
pip install langchain langchain-core langchain-mistralai
|
| 27 |
+
sentence-transformers scikit-learn plotly pandas numpy
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
# ---------------------------------------------------------------------------
|
| 31 |
+
# Stdlib
|
| 32 |
+
# ---------------------------------------------------------------------------
|
| 33 |
+
import json
|
| 34 |
+
import os
|
| 35 |
+
import re
|
| 36 |
+
import time
|
| 37 |
+
from functools import reduce
|
| 38 |
+
from pathlib import Path
|
| 39 |
+
from operator import itemgetter
|
| 40 |
+
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
# Third-party
|
| 43 |
+
# ---------------------------------------------------------------------------
|
| 44 |
+
import numpy as np
|
| 45 |
+
import pandas as pd
|
| 46 |
+
import plotly.express as px
|
| 47 |
+
import plotly.graph_objects as go
|
| 48 |
+
import plotly.figure_factory as ff
|
| 49 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 50 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 51 |
+
from sklearn.preprocessing import normalize
|
| 52 |
+
from sentence_transformers import SentenceTransformer
|
| 53 |
+
|
| 54 |
+
from langchain_core.tools import tool
|
| 55 |
+
from langchain_core.prompts import PromptTemplate
|
| 56 |
+
from langchain_core.output_parsers import JsonOutputParser
|
| 57 |
+
from langchain_mistralai import ChatMistralAI
|
| 58 |
+
|
| 59 |
+
# ---------------------------------------------------------------------------
|
| 60 |
+
# Configuration
|
| 61 |
+
# ---------------------------------------------------------------------------
|
| 62 |
+
|
| 63 |
+
MISTRAL_API_KEY: str = os.environ.get("MISTRAL_API_KEY", "")
|
| 64 |
+
MODEL_NAME: str = "mistral-small-latest"
|
| 65 |
+
EMBED_MODEL: str = "all-MiniLM-L6-v2"
|
| 66 |
+
BASE_DIR: Path = Path(__file__).resolve().parent
|
| 67 |
+
OUTPUT_DIR: Path = BASE_DIR / "outputs"
|
| 68 |
+
N_EVIDENCE: int = 5 # sentences kept per cluster centroid
|
| 69 |
+
DISTANCE_THRESH: float = 0.35 # cosine-distance threshold (1 - similarity)
|
| 70 |
+
RANDOM_SEED: int = 42
|
| 71 |
+
LLM_TIMEOUT_S: int = 45
|
| 72 |
+
LLM_MAX_RETRIES: int = 3
|
| 73 |
+
MAX_LABEL_CLUSTERS: int = 60
|
| 74 |
+
MIN_CLUSTER_SIZE_FOR_LABEL: int = 3
|
| 75 |
+
MAX_TOOL_RETURN_PREVIEW: int = 12
|
| 76 |
+
PROVIDER_RETRY_ATTEMPTS: int = 3
|
| 77 |
+
PROVIDER_RETRY_BASE_DELAY_S: float = 1.5
|
| 78 |
+
|
| 79 |
+
# Run configurations — keys map to source columns
|
| 80 |
+
RUN_CONFIGS: dict[str, list[str]] = {
|
| 81 |
+
"abstract": ["Abstract"],
|
| 82 |
+
"title": ["Title"],
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
# PAJAIS 25-category taxonomy (Pan-Pacific Journal of AIS)
|
| 86 |
+
PAJAIS_TAXONOMY: list[str] = [
|
| 87 |
+
"Artificial Intelligence & Machine Learning",
|
| 88 |
+
"Big Data & Analytics",
|
| 89 |
+
"Blockchain & Distributed Ledger",
|
| 90 |
+
"Cloud Computing & Infrastructure",
|
| 91 |
+
"Cybersecurity & Privacy",
|
| 92 |
+
"Decision Support Systems",
|
| 93 |
+
"Digital Business & E-Commerce",
|
| 94 |
+
"Digital Health & Telemedicine",
|
| 95 |
+
"Digital Innovation & Transformation",
|
| 96 |
+
"Enterprise Systems & ERP",
|
| 97 |
+
"Fintech & Digital Finance",
|
| 98 |
+
"Green IS & Sustainability",
|
| 99 |
+
"Human-Computer Interaction",
|
| 100 |
+
"Information Systems Strategy",
|
| 101 |
+
"IT Governance & Management",
|
| 102 |
+
"Knowledge Management",
|
| 103 |
+
"Mobile Computing & IoT",
|
| 104 |
+
"Natural Language Processing & Text Mining",
|
| 105 |
+
"Organizational Behavior & IS",
|
| 106 |
+
"Platform Ecosystems & APIs",
|
| 107 |
+
"Privacy & Ethics in IS",
|
| 108 |
+
"Smart Cities & Digital Government",
|
| 109 |
+
"Social Media & Collaboration",
|
| 110 |
+
"Supply Chain & Logistics IS",
|
| 111 |
+
"Virtual Reality & Immersive Technologies",
|
| 112 |
+
]
|
| 113 |
+
|
| 114 |
+
# Boilerplate patterns to strip from abstracts
|
| 115 |
+
_BOILERPLATE_RE = re.compile(
|
| 116 |
+
r"(©\s*\d{4}.*?(?:rights reserved|elsevier|springer|wiley)[^.]*\.?)"
|
| 117 |
+
r"|(all rights reserved\.?)"
|
| 118 |
+
r"|(published by.*?(?:ltd|inc|llc)[^.]*\.?)"
|
| 119 |
+
r"|(doi:\s*\S+)",
|
| 120 |
+
re.IGNORECASE,
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
# Sentence splitter — split on sentence-boundary punctuation, keep >= 20 chars
|
| 124 |
+
_SENT_RE = re.compile(r"(?<=[.!?])\s+")
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# ---------------------------------------------------------------------------
|
| 128 |
+
# Private helpers (pure functions, no side-effects)
|
| 129 |
+
# ---------------------------------------------------------------------------
|
| 130 |
+
|
| 131 |
+
def _ensure_dir(path: Path) -> Path:
    """Create ``path`` and any missing parents if needed, then return ``path``."""
    os.makedirs(path, exist_ok=True)
    return path
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _run_dir(run_key: str) -> Path:
    """Return ``OUTPUT_DIR / run_key``, creating the directory if it is missing."""
    target = OUTPUT_DIR / run_key
    target.mkdir(parents=True, exist_ok=True)
    return target
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def _clean_text(text: str) -> str:
    """Remove ``_BOILERPLATE_RE`` matches from ``text`` and trim outer whitespace.

    ``text`` is passed through ``str()`` first, so non-string values are accepted.
    """
    as_text = str(text)
    without_boilerplate = _BOILERPLATE_RE.sub("", as_text)
    return without_boilerplate.strip()
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def _split_sentences(text: str) -> list[str]:
    """Clean ``text``, split it on ``_SENT_RE``, and keep the longer pieces.

    A piece is kept when it has at least 20 characters after stripping
    whitespace; kept pieces are returned unstripped.
    """
    pieces = _SENT_RE.split(_clean_text(text))
    return [piece for piece in pieces if len(piece.strip()) >= 20]
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# Loaded SentenceTransformer instances, keyed by model name. Constructing the
# model is expensive, so each name is loaded at most once per process.
_EMBED_MODEL_CACHE: dict[str, object] = {}


def _embedding_model(name: str):
    """Return the cached SentenceTransformer for ``name``, loading it on first use."""
    model = _EMBED_MODEL_CACHE.get(name)
    if model is None:
        model = SentenceTransformer(name)
        _EMBED_MODEL_CACHE[name] = model
    return model


def _embed(sentences: list[str]) -> np.ndarray:
    """Encode ``sentences`` with ``EMBED_MODEL`` and L2-normalise each row.

    Unit-length rows make cosine similarity equal to a plain dot product.
    The model is reused across calls instead of being reloaded each time.
    """
    model = _embedding_model(EMBED_MODEL)
    raw = model.encode(sentences, show_progress_bar=False, batch_size=64)
    return normalize(raw, norm="l2")
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def _cluster(embeddings: np.ndarray, threshold: float) -> np.ndarray:
    """Cluster ``embeddings`` agglomeratively and return one label per row.

    Uses average linkage on cosine distance. The number of clusters is not
    fixed; merging is governed by ``threshold`` as the distance threshold.
    """
    clusterer = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=threshold,
        metric="cosine",
        linkage="average",
    )
    return clusterer.fit_predict(embeddings)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def _centroid(embeddings: np.ndarray) -> np.ndarray:
    """Return the mean of the rows of ``embeddings``, rescaled to unit L2 norm."""
    mean_row = embeddings.mean(axis=0, keepdims=True)
    unit_rows = normalize(mean_row, norm="l2")
    # normalize() works on 2-D input; unwrap the single row.
    return unit_rows[0]
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def _top_k_indices(embeddings: np.ndarray, centroid: np.ndarray, k: int) -> np.ndarray:
    """Return indices of the ``k`` rows most cosine-similar to ``centroid``.

    Indices are ordered from most to least similar.
    """
    query = centroid.reshape(1, -1)
    scores = cosine_similarity(embeddings, query).flatten()
    descending = np.argsort(scores)[::-1]
    return descending[:k]
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def _llm() -> ChatMistralAI:
    """Construct a ``ChatMistralAI`` client from the module-level configuration."""
    settings = {
        "model": MODEL_NAME,
        "api_key": MISTRAL_API_KEY,
        "temperature": 0.2,
        "random_seed": RANDOM_SEED,
        "timeout": LLM_TIMEOUT_S,
        "max_retries": LLM_MAX_RETRIES,
    }
    return ChatMistralAI(**settings)
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def _is_transient_provider_error(exc: Exception) -> bool:
    """Tell whether ``exc`` looks like a transient provider outage worth retrying.

    The check is a case-insensitive substring search of ``str(exc)`` for a
    fixed set of outage markers.
    """
    markers = (
        "unreachable_backend",
        "internal server error",
        '"code":"1100"',
        '"raw_status_code":503',
        '"raw_status_code":502',
        '"raw_status_code":504',
        "service unavailable",
    )
    text = str(exc).lower()
    return any(marker in text for marker in markers)
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def _invoke_with_retries(fn):
    """Call ``fn()`` and retry it when it fails with a transient provider error.

    At most ``PROVIDER_RETRY_ATTEMPTS`` calls are made. Non-transient errors
    propagate immediately; a transient error on the final attempt propagates
    too. Between attempts the wait grows linearly:
    ``PROVIDER_RETRY_BASE_DELAY_S * attempt_number``.
    """
    final_attempt = PROVIDER_RETRY_ATTEMPTS - 1
    attempt = 0
    while attempt < PROVIDER_RETRY_ATTEMPTS:
        try:
            return fn()
        except Exception as exc:
            # Give up on errors that are not outages, or once out of attempts.
            if not _is_transient_provider_error(exc) or attempt >= final_attempt:
                raise
            time.sleep(PROVIDER_RETRY_BASE_DELAY_S * (attempt + 1))
        attempt += 1

    # Only reachable when PROVIDER_RETRY_ATTEMPTS is not positive.
    raise RuntimeError("Unexpected retry flow in _invoke_with_retries")
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def _save_json(path: Path, data: object) -> None:
    """Write ``data`` to ``path`` as UTF-8 JSON with 2-space indent, non-ASCII kept as-is."""
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(payload)
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def _load_json(path: Path) -> object:
    """Read the UTF-8 file at ``path`` and return its parsed JSON content."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
# ---------------------------------------------------------------------------
|
| 229 |
+
# Plotly chart builders
|
| 230 |
+
# ---------------------------------------------------------------------------
|
| 231 |
+
|
| 232 |
+
def _chart_intertopic(summaries: list[dict]) -> go.Figure:
    """Build the "Intertopic Distance Map" scatter plot from cluster summaries.

    Each summary supplies ``cx``/``cy`` (position), ``size`` (marker size and
    colour) and ``cluster_id`` (text label).
    """
    frame = pd.DataFrame(summaries)
    axis_names = {"cx": "Dim-1", "cy": "Dim-2", "size": "Sentences"}
    figure = px.scatter(
        frame,
        x="cx",
        y="cy",
        size="size",
        color="size",
        text="cluster_id",
        labels=axis_names,
        title="Intertopic Distance Map",
        color_continuous_scale="Blues",
        template="plotly_dark",
    )
    return figure
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def _chart_top_words(summaries: list[dict]) -> go.Figure:
    """Build a horizontal bar chart of the 20 largest clusters by ``size``.

    Bars are labelled with the cluster id converted to a string.
    """
    frame = pd.DataFrame(summaries)
    largest = frame.nlargest(20, "size")
    largest = largest.assign(label=largest["cluster_id"].astype(str))
    figure = px.bar(
        largest,
        x="size",
        y="label",
        orientation="h",
        color="size",
        labels={"size": "Sentences", "label": "Cluster"},
        title="Top Clusters by Sentence Count",
        color_continuous_scale="Teal",
        template="plotly_dark",
    )
    return figure
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def _chart_hierarchy(labels: list[int], embeddings: np.ndarray) -> go.Figure:
    """Build a dendrogram of clusters from the cosine distance between centroids.

    Parameters
    ----------
    labels
        One cluster label per row of ``embeddings``.
    embeddings
        Sentence embeddings, indexed in the same order as ``labels``.

    Returns
    -------
    go.Figure
        The dendrogram, or an empty titled figure when fewer than two
        clusters exist (a hierarchy needs at least two leaves).
    """
    from scipy.spatial.distance import pdist  # scipy is already required by create_dendrogram

    unique = sorted(set(labels))
    if len(unique) < 2:
        placeholder = go.Figure()
        placeholder.update_layout(title="Cluster Hierarchy", template="plotly_dark")
        return placeholder

    labels_arr = np.array(labels)
    centroids = np.vstack([
        _centroid(embeddings[labels_arr == lbl])
        for lbl in unique
    ])
    # create_dendrogram expects observations and derives distances itself via
    # `distfun`. Feeding it a precomputed distance matrix would make it
    # measure Euclidean distance between matrix rows, so pass the centroids
    # and ask for cosine distance explicitly.
    fig = ff.create_dendrogram(
        centroids,
        labels=[str(lbl) for lbl in unique],
        colorscale=px.colors.sequential.Blues,
        distfun=lambda rows: pdist(rows, metric="cosine"),
    )
    fig.update_layout(title="Cluster Hierarchy", template="plotly_dark")
    return fig
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def _chart_heatmap(labels: list[int], embeddings: np.ndarray) -> go.Figure:
    """Heatmap of pairwise cosine similarity between cluster centroids.

    ``labels`` is the per-sentence cluster label sequence used to mask rows of
    ``embeddings``; both axes are ticked with the sorted, stringified ids.
    """
    cluster_ids = sorted(set(labels))
    per_sentence = np.array(labels)

    centroid_rows = []
    for cid in cluster_ids:
        centroid_rows.append(_centroid(embeddings[per_sentence == cid]))
    similarity = cosine_similarity(np.vstack(centroid_rows))

    x_ticks = [str(cid) for cid in cluster_ids]
    y_ticks = [str(cid) for cid in cluster_ids]
    return px.imshow(
        similarity,
        x=x_ticks,
        y=y_ticks,
        title="Cluster Similarity Heatmap",
        color_continuous_scale="Blues",
        template="plotly_dark",
    )
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def _save_chart(fig: go.Figure, path: Path) -> str:
    """Write ``fig`` to ``path`` as a complete HTML page; return the path as ``str``.

    The page is written with ``include_plotlyjs="cdn"``.
    """
    target = str(path)
    fig.write_html(target, full_html=True, include_plotlyjs="cdn")
    return target
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
# ============================================================================
|
| 306 |
+
# TOOL 1 — load_scopus_csv
|
| 307 |
+
# ============================================================================
|
| 308 |
+
|
| 309 |
+
@tool
def load_scopus_csv(filepath: str) -> dict:
    """
    Load a Scopus-exported CSV and extract corpus statistics.

    Parameters
    ----------
    filepath : str
        Absolute or relative path to the CSV file.

    Returns
    -------
    dict with keys:
        paper_count, abstract_sentence_count, title_sentence_count,
        columns, sample_abstracts, filepath
    """
    # Strip whitespace from column headers so "Abstract"/"Title" are found
    # even when the export pads them.
    df = pd.read_csv(filepath).rename(columns=str.strip)

    # Drop missing values once and reuse the result below.
    abstracts = df["Abstract"].dropna()
    titles = df["Title"].dropna()

    # Flatten per-document sentence lists in a single pass. The previous
    # reduce(acc + sents) copied the accumulator for every document, which is
    # quadratic in the number of sentences.
    abstract_sentences = [
        sentence
        for text in abstracts.tolist()
        for sentence in _split_sentences(text)
    ]
    title_sentences = [
        sentence
        for text in titles.tolist()
        for sentence in _split_sentences(text)
    ]

    _ensure_dir(OUTPUT_DIR / "abstract")
    _ensure_dir(OUTPUT_DIR / "title")

    _save_json(OUTPUT_DIR / "abstract" / "sentences.json", abstract_sentences)
    _save_json(OUTPUT_DIR / "title" / "sentences.json", title_sentences)

    # Persist the header-normalised corpus alongside the sentence files.
    df.to_csv(OUTPUT_DIR / "corpus.csv", index=False)

    return {
        "paper_count": int(len(df)),
        "abstract_sentence_count": int(len(abstract_sentences)),
        "title_sentence_count": int(len(title_sentences)),
        "columns": df.columns.tolist(),
        "sample_abstracts": abstracts.head(3).tolist(),
        "filepath": str(filepath),
    }
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
# ============================================================================
|
| 358 |
+
# TOOL 2 — run_bertopic_discovery
|
| 359 |
+
# ============================================================================
|
| 360 |
+
|
| 361 |
+
@tool
def run_bertopic_discovery(run_key: str, threshold: float = DISTANCE_THRESH) -> dict:
    """
    Embed sentences, cluster with AgglomerativeClustering, extract evidence,
    and generate four Plotly charts.

    Saved artefacts
    ---------------
    emb.npy : (N, 384) float32 L2-normalised embeddings
    sent_labels.npy : (N,) int32 per-sentence cluster label [BUG 1 FIX]
    summaries.json : list of cluster dicts with evidence sentences

    Parameters
    ----------
    run_key : str — "abstract" or "title"
    threshold : float — cosine distance threshold for AgglomerativeClustering

    Returns
    -------
    dict with keys:
        run_key, n_clusters, n_sentences, threshold,
        chart_paths, summaries_path, embeddings_path
    """
    rdir = _run_dir(run_key)
    sentences = _load_json(OUTPUT_DIR / run_key / "sentences.json")

    embeddings = _embed(sentences)
    np.save(str(rdir / "emb.npy"), embeddings)

    labels = _cluster(embeddings, threshold).tolist()
    unique_ids = sorted(set(labels))

    # FIX BUG 1 — persist per-sentence label array so Tool 4 can build
    # correct cluster masks without any guesswork or scaffolding.
    np.save(str(rdir / "sent_labels.npy"), np.array(labels, dtype=np.int32))

    labels_arr = np.array(labels)

    def _cluster_summary(cid: int) -> dict:
        """Summarise one cluster: size, first two centroid components, evidence."""
        mask = labels_arr == cid
        # Positions (into ``sentences``) of this cluster's members. Indexing
        # the original list avoids rebuilding a numpy string array of the
        # whole corpus for every cluster and yields plain ``str`` evidence.
        member_positions = np.flatnonzero(mask)
        c_emb = embeddings[mask]
        ctroid = _centroid(c_emb)
        # top_idx indexes rows of c_emb, i.e. entries of member_positions.
        top_idx = _top_k_indices(c_emb, ctroid, N_EVIDENCE)
        return {
            "cluster_id": int(cid),
            "size": int(mask.sum()),
            # The chart coordinates are the first two centroid components.
            "cx": float(ctroid[0]),
            "cy": float(ctroid[1]),
            "evidence": [sentences[int(pos)] for pos in member_positions[top_idx]],
        }

    summaries = [_cluster_summary(cid) for cid in unique_ids]
    _save_json(rdir / "summaries.json", summaries)

    chart_paths = {
        "Intertopic Map": _save_chart(_chart_intertopic(summaries), rdir / "intertopic.html"),
        "Top Words": _save_chart(_chart_top_words(summaries), rdir / "topwords.html"),
        "Hierarchy": _save_chart(_chart_hierarchy(labels, embeddings), rdir / "hierarchy.html"),
        "Heatmap": _save_chart(_chart_heatmap(labels, embeddings), rdir / "heatmap.html"),
    }

    return {
        "run_key": run_key,
        "n_clusters": int(len(unique_ids)),
        "n_sentences": int(len(sentences)),
        "threshold": threshold,
        "chart_paths": chart_paths,
        "summaries_path": str(rdir / "summaries.json"),
        "embeddings_path": str(rdir / "emb.npy"),
    }
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
# ============================================================================
|
| 435 |
+
# TOOL 3 — label_topics_with_llm
|
| 436 |
+
# ============================================================================
|
| 437 |
+
|
| 438 |
+
# Prompt used by label_topics_with_llm. Template variables: `cluster_id`,
# `size` and `evidence` (a pre-formatted, numbered sentence list). The model
# is asked for a raw JSON object with the keys label, category, confidence,
# reasoning and niche.
_LABEL_PROMPT = PromptTemplate.from_template(
    """You are an expert academic researcher specialising in Information Systems.

Given the following cluster of research sentences, return a JSON object with EXACTLY these keys:
label : short research-area name (<= 6 words)
category : broader IS research category
confidence : float 0.0-1.0
reasoning : one sentence explaining your choice
niche : boolean - true if highly specialised / narrow

Cluster ID : {cluster_id}
Sentence count: {size}
Evidence sentences:
{evidence}

Respond with RAW JSON only. No markdown, no explanation outside the JSON.
"""
)
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
@tool
def label_topics_with_llm(run_key: str) -> dict:
    """
    Send each cluster's evidence sentences to Mistral and obtain structured labels.

    Parameters
    ----------
    run_key : str — "abstract" or "title"

    Returns
    -------
    dict with keys:
        run_key, labels_path, labelled_count, total_clusters,
        selected_clusters, skipped_clusters, labels_preview (list of dicts),
        and error (only when summaries.json is missing)
    """
    rdir = _run_dir(run_key)
    summaries_path = rdir / "summaries.json"
    if not summaries_path.exists():
        # Report the missing prerequisite as data instead of raising, so the
        # calling agent can read the message and run discovery first.
        return {
            "run_key": run_key,
            "labels_path": str(rdir / "labels.json"),
            "labelled_count": 0,
            "total_clusters": 0,
            "selected_clusters": 0,
            "skipped_clusters": 0,
            "labels_preview": [],
            "error": (
                f"Missing discovery artifact: {summaries_path}. "
                "Run run_bertopic_discovery first for this run_key."
            ),
        }

    summaries = _load_json(summaries_path)

    # Largest clusters first; clusters below the minimum size are never labelled.
    ranked = sorted(
        (s for s in summaries if s.get("size", 0) >= MIN_CLUSTER_SIZE_FOR_LABEL),
        key=lambda s: s.get("size", 0),
        reverse=True,
    )
    selected = ranked[:MAX_LABEL_CLUSTERS]

    chain = _LABEL_PROMPT | _llm() | JsonOutputParser()

    def _label_one(summary: dict) -> dict:
        """Label one cluster and merge the model's fields into its summary."""
        result = _invoke_with_retries(lambda: chain.invoke({
            "cluster_id": summary["cluster_id"],
            "size": summary["size"],
            "evidence": "\n".join(
                f" {i+1}. {s}"
                for i, s in enumerate(summary["evidence"])
            ),
        }))
        # The parser can return any JSON value; only a dict carries label fields.
        model_fields = result if isinstance(result, dict) else {}
        # Pipeline-owned keys (cluster_id, size, evidence, ...) must survive:
        # later tools index labels.json by cluster_id and pool its evidence,
        # so a model that echoes or rewrites those keys must not win.
        return {
            **summary,
            **{k: v for k, v in model_fields.items() if k not in summary},
        }

    labelled = [_label_one(summary) for summary in selected]
    _save_json(rdir / "labels.json", labelled)

    # Keep tool output compact so the ReAct transcript does not overflow model context.
    preview = [
        {
            "cluster_id": r.get("cluster_id"),
            "label": r.get("label"),
            "category": r.get("category"),
            "confidence": r.get("confidence"),
            "size": r.get("size"),
            "niche": r.get("niche", False),
        }
        for r in labelled[:MAX_TOOL_RETURN_PREVIEW]
    ]

    return {
        "run_key": run_key,
        "labels_path": str(rdir / "labels.json"),
        "labelled_count": len(labelled),
        "total_clusters": len(summaries),
        "selected_clusters": len(selected),
        "skipped_clusters": max(0, len(summaries) - len(selected)),
        "labels_preview": preview,
    }
|
| 536 |
+
|
| 537 |
+
|
| 538 |
+
# ============================================================================
|
| 539 |
+
# TOOL 4 — consolidate_into_themes
|
| 540 |
+
# ============================================================================
|
| 541 |
+
|
| 542 |
+
@tool
def consolidate_into_themes(run_key: str, theme_map: dict) -> dict:
    """
    Merge approved / renamed topics into consolidated themes and recompute
    centroids from the actual merged-cluster embeddings.

    Parameters
    ----------
    run_key : str — "abstract" or "title"
    theme_map : dict — {new_theme_name: [cluster_id, ...], ...}
        Only approved topics need appear here.

    Returns
    -------
    dict with keys:
        run_key, theme_count, themes_path, themes_preview (list of dicts)
    """
    rdir = _run_dir(run_key)
    labels_data = _load_json(rdir / "labels.json")
    embeddings = np.load(str(rdir / "emb.npy"))  # (N, 384)
    sent_labels = np.load(str(rdir / "sent_labels.npy"))  # (N,) — FIX BUG 1

    # Index label dicts by cluster_id for O(1) lookup
    label_idx = {item["cluster_id"]: item for item in labels_data}

    def _build_theme(theme_name: str, cids: list[int]) -> dict:
        """
        Build one consolidated theme from a list of cluster IDs.

        Evidence : top-N sentences pooled across all merged clusters
        Centroid : L2-normalised mean of all embeddings in the merged set
        Size : total sentence count across merged clusters
        """
        # Normalise ids once. Ids that arrive as numeric strings would match
        # the int32 embedding mask below but miss the label lookup, producing
        # a theme with a centroid yet no evidence, size or sub-labels.
        cluster_ids = [int(cid) for cid in cids]

        # Label records for the requested clusters; ids without a record
        # (never labelled) are skipped, as are empty records.
        members = [
            label_idx[cid]
            for cid in cluster_ids
            if label_idx.get(cid)
        ]

        # Pool evidence sentences from all member clusters
        all_evidence = [
            sentence
            for member in members
            for sentence in member["evidence"]
        ]

        # Total sentence count across merged clusters
        total_size = sum(member.get("size", 0) for member in members)

        # FIX BUG 1 — build correct cluster mask using persisted sent_labels
        cluster_mask = np.isin(sent_labels, np.array(cluster_ids, dtype=np.int32))
        theme_embeddings = embeddings[cluster_mask]  # (M, 384)

        # Guard: if mask is somehow empty fall back to zero vector
        theme_centroid = (
            _centroid(theme_embeddings)
            if theme_embeddings.shape[0] > 0
            else np.zeros(embeddings.shape[1], dtype=np.float32)
        )

        return {
            "theme_name": theme_name,
            "cluster_ids": cluster_ids,
            "size": total_size,
            "evidence": all_evidence[:N_EVIDENCE],
            "centroid": theme_centroid.tolist(),
            # The label comes from model output and may be absent; skip such
            # records instead of failing the whole consolidation.
            "sub_labels": [
                member["label"]
                for member in members
                if "label" in member
            ],
        }

    themes = [_build_theme(name, cids) for name, cids in theme_map.items()]

    _save_json(rdir / "themes.json", themes)

    # Compact preview: the full themes (with centroids) live in themes.json.
    preview = [
        {
            "theme_name": t.get("theme_name"),
            "size": t.get("size", 0),
            "cluster_count": len(t.get("cluster_ids", [])),
        }
        for t in themes[:MAX_TOOL_RETURN_PREVIEW]
    ]

    return {
        "run_key": run_key,
        "theme_count": len(themes),
        "themes_path": str(rdir / "themes.json"),
        "themes_preview": preview,
    }
|
| 636 |
+
|
| 637 |
+
|
| 638 |
+
# ============================================================================
|
| 639 |
+
# TOOL 5 — compare_with_taxonomy
|
| 640 |
+
# ============================================================================
|
| 641 |
+
|
| 642 |
+
# Prompt used by compare_with_taxonomy. Template variables: `taxonomy` (the
# bulleted category list), `theme_name` and `evidence`. The model is asked for
# raw JSON with the keys theme_name, pajais_match, confidence, reasoning and
# is_novel.
_TAXONOMY_PROMPT = PromptTemplate.from_template(
    """You are an IS research taxonomist. Map the following research theme to the
PAJAIS taxonomy. Return RAW JSON with EXACTLY these keys:
theme_name : the input theme name (unchanged)
pajais_match : best matching PAJAIS category OR the string "NOVEL"
confidence : float 0.0-1.0
reasoning : one sentence
is_novel : boolean

PAJAIS categories:
{taxonomy}

Theme to map:
Name : {theme_name}
Evidence : {evidence}

Respond with RAW JSON only. No markdown.
"""
)
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
@tool
def compare_with_taxonomy(run_key: str) -> dict:
    """
    Map consolidated themes to PAJAIS taxonomy via Mistral.

    Parameters
    ----------
    run_key : str — "abstract" or "title"

    Returns
    -------
    dict with keys:
        run_key, taxonomy_path, mapped_count, novel_count, mapping_preview
    """
    rdir = _run_dir(run_key)
    themes = _load_json(rdir / "themes.json")
    chain = _TAXONOMY_PROMPT | _llm() | JsonOutputParser()

    # The category list is identical for every theme, so format it once.
    taxonomy_str = "\n".join(f" - {cat}" for cat in PAJAIS_TAXONOMY)

    def _map_theme(theme: dict) -> dict:
        """Ask the model for a taxonomy match and merge it into the theme."""
        result = _invoke_with_retries(lambda: chain.invoke({
            "taxonomy": taxonomy_str,
            "theme_name": theme["theme_name"],
            # Only the first three evidence sentences are sent to the model.
            "evidence": " | ".join(theme.get("evidence", [])[:3]),
        }))
        # The parser can return any JSON value; only a dict carries mapping fields.
        model_fields = result if isinstance(result, dict) else {}
        # The prompt asks for theme_name to be echoed unchanged; keep the
        # pipeline's own copy (and every other theme field) authoritative so
        # a deviating reply cannot rename or corrupt the theme.
        return {
            **theme,
            **{k: v for k, v in model_fields.items() if k not in theme},
        }

    taxonomy_map = [_map_theme(theme) for theme in themes]
    _save_json(rdir / "taxonomy_map.json", taxonomy_map)

    novel_count = sum(1 for t in taxonomy_map if t.get("is_novel", False))
    mapped_count = len(taxonomy_map) - novel_count

    # Compact preview; the full records are in taxonomy_map.json.
    preview = [
        {
            "theme_name": t.get("theme_name"),
            "pajais_match": t.get("pajais_match", "NOVEL"),
            "confidence": t.get("confidence", 0),
            "is_novel": t.get("is_novel", False),
        }
        for t in taxonomy_map[:MAX_TOOL_RETURN_PREVIEW]
    ]

    return {
        "run_key": run_key,
        "taxonomy_path": str(rdir / "taxonomy_map.json"),
        "mapped_count": mapped_count,
        "novel_count": novel_count,
        "mapping_preview": preview,
    }
|
| 714 |
+
|
| 715 |
+
|
| 716 |
+
# ============================================================================
|
| 717 |
+
# TOOL 6 — generate_comparison_csv
|
| 718 |
+
# ============================================================================
|
| 719 |
+
|
| 720 |
+
@tool
def generate_comparison_csv() -> dict:
    """
    Side-by-side comparison of abstract-run vs title-run themes.

    FIX ISSUE 1: title run is optional — no longer crashes if only the
    abstract run has been completed. title_map defaults to [] when the
    title taxonomy_map.json file does not exist.

    Returns
    -------
    dict with keys:
        csv_path, row_count, columns, preview (list of dicts)
    """
    # NOTE(review): docstring left verbatim — if `tool` is LangChain's
    # decorator it is also the description shown to the agent; confirm.
    abstract_map = _load_json(OUTPUT_DIR / "abstract" / "taxonomy_map.json")

    # The title run is optional: fall back to an empty list when absent.
    title_path = OUTPUT_DIR / "title" / "taxonomy_map.json"
    title_map = []
    if title_path.exists():
        title_map = _load_json(title_path)

    def _row(a_theme: dict, t_theme: dict | None) -> dict:
        """One CSV row pairing an abstract theme with a title theme."""
        # A missing or padded title entry yields the blank defaults below.
        title = t_theme or {}
        return {
            "Abstract Theme": a_theme.get("theme_name", ""),
            "Abstract PAJAIS": a_theme.get("pajais_match", ""),
            "Abstract Confidence": a_theme.get("confidence", 0),
            "Abstract Novel": a_theme.get("is_novel", False),
            "Title Theme": title.get("theme_name", ""),
            "Title PAJAIS": title.get("pajais_match", ""),
            "Title Confidence": title.get("confidence", 0),
            "Title Novel": title.get("is_novel", False),
        }

    # Pad the shorter list with empty dicts so rows pair up positionally.
    row_total = max(len(abstract_map), len(title_map))
    abstract_side = abstract_map + [{}] * (row_total - len(abstract_map))
    title_side = title_map + [{}] * (row_total - len(title_map))

    records = [_row(a, t) for a, t in zip(abstract_side, title_side)]
    df = pd.DataFrame(records)

    out_path = OUTPUT_DIR / "comparison.csv"
    df.to_csv(out_path, index=False)

    return {
        "csv_path": str(out_path),
        "row_count": len(df),
        "columns": df.columns.tolist(),
        "preview": df.head(5).to_dict(orient="records"),
    }
|
| 774 |
+
|
| 775 |
+
|
| 776 |
+
# ============================================================================
|
| 777 |
+
# TOOL 7 — export_narrative
|
| 778 |
+
# ============================================================================
|
| 779 |
+
|
| 780 |
+
# Prompt used by export_narrative. Template variables: `abstract_themes` and
# `title_themes`, each a pre-formatted multi-line summary string. The model is
# asked for plain text, not JSON.
_NARRATIVE_PROMPT = PromptTemplate.from_template(
    """You are an academic researcher writing a methodology and findings section.

Write a 500-word academic narrative describing the thematic analysis results below.
Structure: (1) methodology overview, (2) major themes found, (3) PAJAIS alignment,
(4) novel contributions, (5) limitations.

Use formal academic English. Do NOT use bullet points.

Abstract themes & taxonomy:
{abstract_themes}

Title themes & taxonomy:
{title_themes}

Respond with plain text only.
"""
)
|
| 798 |
+
|
| 799 |
+
|
| 800 |
+
@tool
def export_narrative(run_key: str) -> dict:
    """
    Generate a 500-word academic narrative and save to narrative.txt.

    Parameters
    ----------
    run_key : str — "abstract" or "title" (primary source)

    Returns
    -------
    dict with keys:
        narrative_path, word_count, preview (first 300 chars)
    """
    # run_key only selects the directory narrative.txt is written to; the
    # taxonomy maps are always read from the fixed abstract/title locations.
    rdir = _run_dir(run_key)
    title_path = OUTPUT_DIR / "title" / "taxonomy_map.json"

    abstract_map = _load_json(OUTPUT_DIR / "abstract" / "taxonomy_map.json")
    title_map = _load_json(title_path) if title_path.exists() else []

    def _confidence(value) -> float:
        """Coerce a model-supplied confidence to float, defaulting to 0.0."""
        # The value comes from model-generated JSON and may be a string or
        # null; formatting those with ':.2f' would raise.
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    def _theme_summary(t: dict) -> str:
        """One prompt line describing a theme and its taxonomy match."""
        return (
            f" - {t.get('theme_name','?')} -> {t.get('pajais_match','?')} "
            f"(conf={_confidence(t.get('confidence', 0)):.2f}, novel={t.get('is_novel',False)})"
        )

    abstract_str = "\n".join(map(_theme_summary, abstract_map))
    # An absent or empty title run is reported to the model explicitly.
    title_str = "\n".join(map(_theme_summary, title_map)) or "Not run."

    chain = _NARRATIVE_PROMPT | _llm()
    response = _invoke_with_retries(lambda: chain.invoke({
        "abstract_themes": abstract_str,
        "title_themes": title_str,
    }))

    # Use the reply's `.content` when it has one; otherwise stringify it.
    narrative = response.content if hasattr(response, "content") else str(response)
    out_path = rdir / "narrative.txt"
    out_path.write_text(narrative, encoding="utf-8")

    return {
        "narrative_path": str(out_path),
        "word_count": len(narrative.split()),
        "preview": narrative[:300],
    }
|
| 844 |
+
|
| 845 |
+
|
| 846 |
+
# ---------------------------------------------------------------------------
|
| 847 |
+
# Tool registry — imported by agent.py
|
| 848 |
+
# ---------------------------------------------------------------------------
|
| 849 |
+
|
| 850 |
+
# Tools are listed in the same order as the TOOL 1-7 sections of this file.
ALL_TOOLS = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]
|