nethra815 committed on
Commit
e768563
·
verified ·
1 Parent(s): f5e6f5d

Initial commit

Browse files
Files changed (4) hide show
  1. agent.py +205 -0
  2. app.py +461 -0
  3. requirements.txt +26 -0
  4. tools.py +443 -0
agent.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ agent.py — LangGraph ReAct agent for Braun & Clarke (2006) thematic analysis.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from langgraph.prebuilt import create_react_agent
8
+ from langgraph.checkpoint.memory import MemorySaver
9
+ from langchain_mistralai import ChatMistralAI
10
+
11
+ from tools import (
12
+ load_scopus_csv,
13
+ run_bertopic_discovery,
14
+ label_topics_with_llm,
15
+ consolidate_into_themes,
16
+ compare_with_taxonomy,
17
+ generate_comparison_csv,
18
+ export_narrative,
19
+ )
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # System prompt
23
+ # ---------------------------------------------------------------------------
24
+
25
+ SYSTEM_PROMPT = """
26
+ You are a computational thematic analysis expert specialising in Braun & Clarke (2006)
27
+ six-phase thematic analysis applied to systematic literature reviews. You work with
28
+ Scopus CSV exports and guide researchers through a rigorous, reproducible analysis
29
+ pipeline using BERTopic clustering and LLM-assisted labelling.
30
+
31
+ ═══════════════════════════════════════════════════════════════════
32
+ ROLE
33
+ ═══════════════════════════════════════════════════════════════════
34
+ - Expert in qualitative and computational thematic analysis
35
+ - Familiar with PAJAIS (25 AI research categories) taxonomy
36
+ - Methodologically rigorous: one phase per message, no skipping
37
+ - You EXPLAIN what you did, what you found, and what the researcher should do next
38
+ - You never proceed to the next phase without explicit user approval via the review table
39
+
40
+ ═══════════════════════════════════════════════════════════════════
41
+ CRITICAL RULES
42
+ ═══════════════════════════════════════════════════════════════════
43
+ 1. Complete EXACTLY ONE phase per conversational turn, then STOP and wait.
44
+ 2. ALL topic approvals, renames, and groupings happen via the REVIEW TABLE β€” never via chat.
45
+ 3. Never ask the user to type topic labels or approvals into the chat.
46
+ 4. After every phase, output a clear STOP GATE message telling the user what to review.
47
+ 5. You must call the appropriate tool for each phase β€” do NOT fabricate results.
48
+ 6. Always report tool outputs clearly: total papers, sentences, clusters, themes.
49
+ 7. When showing the review table, list all columns: #, Topic Label, Top Evidence,
50
+ Sentences, Papers, Approve (Yes/No), Rename To, Reasoning.
51
+ 8. Progress is tracked in the phase progress bar β€” reference the current phase by name.
52
+
53
+ ═══════════════════════════════════════════════════════════════════
54
+ AVAILABLE TOOLS
55
+ ═══════════════════════════════════════════════════════════════════
56
+ 1. load_scopus_csv β€” Load CSV, count papers/sentences, apply boilerplate filter
57
+ 2. run_bertopic_discovery β€” Embed + cluster sentences, find centroids, generate 4 charts
58
+ 3. label_topics_with_llm β€” Send top-100 topics to Mistral for human-readable labels
59
+ 4. consolidate_into_themesβ€” Merge approved topic groups into named themes, recompute centroids
60
+ 5. compare_with_taxonomy β€” Map final themes to PAJAIS 25 categories
61
+ 6. generate_comparison_csvβ€” Abstract vs title side-by-side CSV export
62
+ 7. export_narrative β€” Generate ~500-word Section 7 narrative via Mistral
63
+
64
+ ═══════════════════════════════════════════════════════════════════
65
+ BRAUN & CLARKE (2006) β€” SIX PHASES
66
+ ═══════════════════════════════════════════════════════════════════
67
+
68
+ ──────────────────────────────────────────────────────────────────
69
+ PHASE 1 β€” Familiarisation with the Data
70
+ ──────────────────────────────────────────────────────────────────
71
+ Steps:
72
+ 1. Call load_scopus_csv with the uploaded CSV path and run_config="abstract".
73
+ 2. Report: total papers, total sentences after boilerplate filtering, columns used.
74
+ 3. Show a brief sample of 3–5 cleaned abstracts.
75
+ 4. Explain what boilerplate was removed and why.
76
+ 5. Confirm the dataset is ready for initial coding.
77
+
78
+ β›” STOP GATE 1: After reporting statistics, STOP. Tell the user:
79
+ "Phase 1 complete. Please review the dataset statistics above. When ready,
80
+ type 'proceed to Phase 2' to begin BERTopic clustering."
81
+
82
+ ──────────────────────────────────────────────────────────────────
83
+ PHASE 2 β€” Generating Initial Codes
84
+ ──────────────────────────────────────────────────────────────────
85
+ Steps:
86
+ 1. Call run_bertopic_discovery on the cleaned parquet file.
87
+ 2. Call label_topics_with_llm to generate human-readable labels for top-100 clusters.
88
+ 3. Populate the REVIEW TABLE with all labelled topics (columns: #, Topic Label,
89
+ Top Evidence, Sentences, Papers, Approve, Rename To, Reasoning).
90
+ 4. Explain the clustering method (all-MiniLM-L6-v2 + AgglomerativeClustering cosine 0.7).
91
+ 5. Show the 4 generated charts in the Charts tab.
92
+
93
+ β›” STOP GATE 2: After displaying the review table, STOP. Tell the user:
94
+ "Phase 2 complete. Please review the 100 topics in the Review Table.
95
+ For each topic: set Approve=Yes/No, optionally fill Rename To and Reasoning.
96
+ Group related topics by noting the same new label. When done, click 'Submit Review'."
97
+ DO NOT proceed until Submit Review is clicked.
98
+
99
+ ──────────────────────────────────────────────────────────────────
100
+ PHASE 3 β€” Searching for Themes
101
+ ──────────────────────────────────────────────────────────────────
102
+ Steps:
103
+ 1. Parse the submitted review table to extract approved topics and their groupings.
104
+ 2. Call consolidate_into_themes with the approved groups JSON.
105
+ 3. Present the consolidated themes with: theme name, constituent topics, top sentences,
106
+ and sentence count.
107
+ 4. Explain how topics were merged and centroids recomputed.
108
+
109
+ β›” STOP GATE 3: After showing consolidated themes, STOP. Tell the user:
110
+ "Phase 3 complete. Please review the consolidated themes in the Review Table.
111
+ Approve, rename, or merge themes as needed. Click 'Submit Review' when done."
112
+ DO NOT proceed until Submit Review is clicked.
113
+
114
+ ──────────────────────────────────────────────────────────────────
115
+ PHASE 4 β€” Reviewing Themes (Saturation Check)
116
+ ──────────────────────────────────────────────────────────────────
117
+ Steps:
118
+ 1. Compute coverage: what % of total sentences are captured by approved themes.
119
+ 2. Identify any sentences/topics NOT covered by a theme (orphan codes).
120
+ 3. Report saturation metrics: coverage %, orphan count, theme overlap.
121
+ 4. Suggest whether any orphan codes warrant a new theme or should be discarded.
122
+ 5. Update the review table with coverage statistics per theme.
123
+
124
+ β›” STOP GATE 4: After reporting saturation, STOP. Tell the user:
125
+ "Phase 4 complete. Coverage is [X]%. Please review the saturation report.
126
+ Adjust theme groupings in the Review Table if needed. Click 'Submit Review'
127
+ to confirm final themes."
128
+ DO NOT proceed until Submit Review is clicked.
129
+
130
+ ──────────────────────────────────────────────────────────────────
131
+ PHASE 5 β€” Defining and Naming Themes
132
+ ──────────────────────────────────────────────────────────────────
133
+ Steps:
134
+ 1. For each confirmed theme, generate: a definitive name, a 2-sentence definition,
135
+ and 3 exemplary quotes from the data.
136
+ 2. Explain how the name captures the essence of the theme.
137
+ 3. Ensure theme names are analytic (not merely descriptive).
138
+ 4. Present the finalised theme map.
139
+
140
+ β›” STOP GATE 5 (implicit): Present the final theme map and ask:
141
+ "Phase 5 complete. Please confirm the final theme names and definitions above.
142
+ When satisfied, type 'proceed to PAJAIS mapping'."
143
+
144
+ ──────────────────────────────────────────────────────────────────
145
+ PHASE 5.5 β€” PAJAIS Taxonomy Mapping
146
+ ──────────────────────────────────────────────────────────────────
147
+ Steps:
148
+ 1. Call compare_with_taxonomy to map each theme to PAJAIS 25 categories.
149
+ 2. Present a mapping table: Theme β†’ PAJAIS Category, Confidence, Rationale.
150
+ 3. Highlight any themes that map to multiple categories (ambiguous cases).
151
+
152
+ β›” STOP GATE 5.5: After presenting the mapping, STOP. Tell the user:
153
+ "PAJAIS mapping complete. Please review the taxonomy mappings in the Review Table.
154
+ Adjust any incorrect mappings. Click 'Submit Review' to confirm."
155
+ DO NOT proceed until Submit Review is clicked.
156
+
157
+ ──────────────────────────────────────────────────────────────────
158
+ PHASE 6 β€” Producing the Report
159
+ ──────────────────────────────────────────────────────────────────
160
+ Steps:
161
+ 1. Call generate_comparison_csv to produce the abstract vs title comparison.
162
+ 2. Call export_narrative to generate the ~500-word Section 7 discussion.
163
+ 3. Present the narrative inline and confirm all files are ready for download.
164
+ 4. List all downloadable outputs: comparison CSV, narrative.md, topics.json,
165
+ themes.json, taxonomy_mapping.json, charts.
166
+ 5. Congratulate the researcher and summarise the full analysis pipeline.
167
+
168
+ No STOP GATE β€” Phase 6 is the final deliverable.
169
+
170
+ ═══════════════════════════════════════════════════════════════════
171
+ OUTPUT FORMAT GUIDELINES
172
+ ═══════════════════════════════════════════════════════════════════
173
+ - Always start your response with: **Phase X β€” [Phase Name]** and the progress %.
174
+ - Use markdown tables for review tables.
175
+ - Use code blocks for JSON snippets.
176
+ - End every non-final phase with a clearly marked β›” STOP message.
177
+ - When referencing tool outputs, always show the key numbers (papers, sentences, clusters).
178
+ """
179
+
180
+ # ---------------------------------------------------------------------------
181
+ # Agent construction
182
+ # ---------------------------------------------------------------------------
183
+
184
+ _llm = ChatMistralAI(model="mistral-large-latest", temperature=0)
185
+
186
+ _tools = [
187
+ load_scopus_csv,
188
+ run_bertopic_discovery,
189
+ label_topics_with_llm,
190
+ consolidate_into_themes,
191
+ compare_with_taxonomy,
192
+ generate_comparison_csv,
193
+ export_narrative,
194
+ ]
195
+
196
+ _memory = MemorySaver()
197
+
198
+ agent = create_react_agent(
199
+ model=_llm,
200
+ tools=_tools,
201
+ checkpointer=_memory,
202
+ prompt=SYSTEM_PROMPT,
203
+ )
204
+
205
+ __all__ = ["agent", "SYSTEM_PROMPT"]
app.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py — Gradio Blocks UI for the BERTopic Thematic Analysis Agent.
3
+ Sections: (1) Data Input, (2) Agent Conversation, (3) Results
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import uuid
10
+ from pathlib import Path
11
+ import os
12
+
13
+ import gradio as gr
14
+ import pandas as pd
15
+ import plotly.io as pio
16
+
17
+ from agent import agent
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Constants
21
+ # ---------------------------------------------------------------------------
22
+
23
+ THREAD_ID = str(uuid.uuid4())
24
+ AGENT_CONFIG = {
25
+ "configurable": {"thread_id": THREAD_ID},
26
+ "recursion_limit": 100,
27
+ }
28
+
29
+ REVIEW_COLUMNS = [
30
+ "#",
31
+ "Topic Label",
32
+ "Top Evidence",
33
+ "Sentences",
34
+ "Papers",
35
+ "Approve",
36
+ "Rename To",
37
+ "Reasoning",
38
+ ]
39
+
40
+ PHASE_LABELS = [
41
+ ("Phase 1", "Familiarisation"),
42
+ ("Phase 2", "Initial Codes"),
43
+ ("Phase 3", "Themes"),
44
+ ("Phase 4", "Saturation"),
45
+ ("Phase 5", "Naming"),
46
+ ("Phase 5.5", "PAJAIS"),
47
+ ("Phase 6", "Report"),
48
+ ]
49
+
50
+ CHART_OPTIONS = [
51
+ "Bar β€” Top 20 Topics",
52
+ "Treemap β€” Topic Distribution",
53
+ "Scatter β€” Cluster PCA",
54
+ "Heatmap β€” Topic Similarity",
55
+ ]
56
+
57
+ _CHART_KEYS = ["bar_top20", "treemap", "scatter_pca", "heatmap"]
58
+
59
+ # ---------------------------------------------------------------------------
60
+ # Helpers
61
+ # ---------------------------------------------------------------------------
62
+
63
+ def _phase_bar_html(active_index: int) -> str:
64
+ steps_html = ""
65
+ for i, (code, name) in enumerate(PHASE_LABELS):
66
+ if i < active_index:
67
+ state, bg, fg = "done", "#10b981", "#ffffff"
68
+ elif i == active_index:
69
+ state, bg, fg = "active", "#6366f1", "#ffffff"
70
+ else:
71
+ state, bg, fg = "pending", "#e5e7eb", "#6b7280"
72
+ steps_html += (
73
+ f'<div style="display:flex;flex-direction:column;align-items:center;gap:4px;flex:1;">'
74
+ f'<div style="width:32px;height:32px;border-radius:50%;background:{bg};'
75
+ f'color:{fg};display:flex;align-items:center;justify-content:center;'
76
+ f'font-size:11px;font-weight:600;">{i+1}</div>'
77
+ f'<span style="font-size:10px;color:#374151;text-align:center;line-height:1.2;">'
78
+ f'{code}<br>{name}</span>'
79
+ f'</div>'
80
+ )
81
+ if i < len(PHASE_LABELS) - 1:
82
+ line_bg = "#10b981" if i < active_index else "#e5e7eb"
83
+ steps_html += (
84
+ f'<div style="flex:1;height:2px;background:{line_bg};margin-top:16px;'
85
+ f'max-width:40px;"></div>'
86
+ )
87
+ return (
88
+ f'<div style="padding:16px 8px;background:#f9fafb;border-radius:12px;'
89
+ f'border:1px solid #e5e7eb;margin-bottom:8px;">'
90
+ f'<div style="display:flex;align-items:flex-start;justify-content:space-between;">'
91
+ f'{steps_html}</div></div>'
92
+ )
93
+
94
+
95
+ def _empty_review_df() -> pd.DataFrame:
96
+ return pd.DataFrame(columns=REVIEW_COLUMNS)
97
+
98
+
99
+ def _load_charts() -> dict:
100
+ p = Path("charts.json")
101
+ return json.loads(p.read_text()) if p.exists() else {}
102
+
103
+
104
+ def _call_agent(message: str, history: list):
105
+ result = agent.invoke(
106
+ {"messages": [{"role": "user", "content": message}]},
107
+ config=AGENT_CONFIG,
108
+ )
109
+
110
+ ai_msg = result["messages"][-1].content
111
+
112
+ updated_history = history + [
113
+ {"role": "user", "content": message},
114
+ {"role": "assistant", "content": ai_msg},
115
+ ]
116
+
117
+ return updated_history, ""
118
+
119
+
120
def _cell_text(value) -> str:
    """Return a table cell as stripped text, mapping None/NaN to ''."""
    return "" if pd.isna(value) else str(value).strip()


def _topic_id(value) -> int:
    """Coerce a '#' cell (int, float such as 3.0, or numeric string) to int; 0 if not numeric."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return 0


def _submit_review(
    review_df: pd.DataFrame,
    history: list,
) -> tuple[list, str, pd.DataFrame]:
    """Read table edits, serialise to JSON, send to agent.

    Rows whose "Approve" cell is "yes" (case-insensitive, surrounding
    whitespace ignored) are grouped by theme name. The theme name is the
    "Rename To" cell when filled, otherwise the "Topic Label", otherwise
    ``Theme_<#>``. Rows that share a theme name end up in the same group.

    Args:
        review_df: Current contents of the review table.
        history: Chat history as a list of ``{"role", "content"}`` dicts.

    Returns:
        ``(updated_history, "", refreshed_table)`` where the empty string
        clears the chat input box.
    """
    if review_df is None:
        review_df = pd.DataFrame()

    if review_df.empty:
        approved = review_df
    else:
        is_approved = (
            review_df["Approve"].astype(str).str.strip().str.lower() == "yes"
        )
        approved = review_df[is_approved]

    groups: dict[str, list[int]] = {}
    for _, row in approved.iterrows():
        # Blank cells come back as NaN, which is truthy; normalise them to ""
        # first so an empty "Rename To" falls through to the topic label
        # instead of producing a theme literally named "nan".
        theme_name = (
            _cell_text(row.get("Rename To"))
            or _cell_text(row.get("Topic Label"))
            or f"Theme_{row['#']}"
        )
        # The "#" column is numeric, so ids may arrive as floats (3.0).
        groups.setdefault(theme_name, []).append(_topic_id(row["#"]))

    groups_list = [
        {"theme_name": name, "topic_ids": ids}
        for name, ids in groups.items()
    ]

    summary = (
        f"Review submitted. Approved topics: {len(approved)}.\n"
        f"Groups formed: {len(groups_list)}.\n\n"
        f"{json.dumps(groups_list, indent=2)}\n\n"
        f"Please consolidate these groups into themes."
    )

    updated_history, _ = _call_agent(summary, history)
    refreshed = _refresh_review_table()
    return updated_history, "", refreshed
156
+
157
+
158
+ def _upload_csv(file_obj):
159
+ if file_obj is None:
160
+ return "", "No file uploaded."
161
+
162
+ # πŸ”₯ CLEAR OLD FILES
163
+ files_to_clear = [
164
+ "labelled_topics.json",
165
+ "summaries.json",
166
+ "taxonomy_mapping.json",
167
+ "comparison.csv",
168
+ "report.txt"
169
+ ]
170
+
171
+ list(map(lambda f: os.remove(f) if os.path.exists(f) else None, files_to_clear))
172
+
173
+ path = file_obj.name
174
+ return path, f"βœ… File ready: `{path}`"
175
+
176
+
177
+ def _start_analysis(csv_path: str, history: list) -> tuple[list, str, str, pd.DataFrame]:
178
+ if not csv_path:
179
+ return history, "", "⚠️ Please upload a CSV first.", _empty_review_df()
180
+ msg = (
181
+ f"I have uploaded a Scopus CSV at: {csv_path}\n"
182
+ f"Please begin Phase 1 β€” Familiarisation. Load the CSV, report statistics, "
183
+ f"and STOP after Phase 1."
184
+ )
185
+ updated_history, _ = _call_agent(msg, history)
186
+ phase_html = _phase_bar_html(0)
187
+ return updated_history, "", phase_html, _empty_review_df()
188
+
189
+
190
+ def _send_message(user_msg: str, history: list, phase_html: str) -> tuple[list, str, str, pd.DataFrame]:
191
+ if not user_msg.strip():
192
+ return history, "", phase_html, _refresh_review_table()
193
+ updated_history, _ = _call_agent(user_msg, history)
194
+ last_ai = updated_history[-1]["content"] if updated_history else ""
195
+ new_phase = _detect_phase(last_ai, phase_html)
196
+ refreshed = _refresh_review_table()
197
+ return updated_history, "", new_phase, refreshed
198
+
199
+
200
+ def _detect_phase(ai_text: str, current_html: str) -> str:
201
+ phase_map = {
202
+ "phase 1": 0, "phase 2": 1, "phase 3": 2,
203
+ "phase 4": 3, "phase 5.5": 5, "phase 5": 4, "phase 6": 6,
204
+ }
205
+ lower = ai_text.lower()
206
+ detected = current_html
207
+ for key, idx in sorted(phase_map.items(), key=lambda x: -len(x[0])):
208
+ if f"{key} complete" in lower or f"beginning {key}" in lower or f"starting {key}" in lower:
209
+ detected = _phase_bar_html(idx)
210
+ break
211
+ return detected
212
+
213
+
214
+ def _get_chart_plot(chart_name: str):
215
+ charts = _load_charts()
216
+ key_map = dict(zip(CHART_OPTIONS, _CHART_KEYS))
217
+ key = key_map.get(chart_name, "")
218
+ payload = charts.get(key, "")
219
+ if not payload or str(payload).lstrip().startswith("<"):
220
+ return None
221
+ return pio.from_json(payload)
222
+
223
+
224
+ def _get_download_files() -> list[str]:
225
+ candidates = [
226
+ "comparison_abstract_vs_title.csv",
227
+ "narrative.md",
228
+ "topics.json",
229
+ "labelled_topics.json",
230
+ "themes.json",
231
+ "taxonomy_mapping.json",
232
+ "summaries.json",
233
+ ]
234
+ return list(filter(lambda p: Path(p).exists(), candidates))
235
+
236
+
237
+ def _refresh_review_table() -> pd.DataFrame:
238
+ themes_path = Path("themes.json")
239
+ if themes_path.exists():
240
+ themes = json.loads(themes_path.read_text())
241
+ rows = list(map(
242
+ lambda idx_theme: {
243
+ "#": idx_theme[0] + 1,
244
+ "Topic Label": idx_theme[1].get("theme_name", f"Theme {idx_theme[0] + 1}"),
245
+ "Top Evidence": " | ".join(idx_theme[1].get("top_sentences", [])[:2]),
246
+ "Sentences": len(idx_theme[1].get("top_sentences", [])),
247
+ "Papers": "",
248
+ "Approve": "Yes",
249
+ "Rename To": "",
250
+ "Reasoning": "",
251
+ },
252
+ list(enumerate(themes)),
253
+ ))
254
+ return pd.DataFrame(rows)
255
+
256
+ topics_path = Path("labelled_topics.json")
257
+ if not topics_path.exists():
258
+ return _empty_review_df()
259
+ topics = json.loads(topics_path.read_text())
260
+ rows = list(map(
261
+ lambda t: {
262
+ "#": t["topic_id"],
263
+ "Topic Label": t.get("label", f"Topic {t['topic_id']}"),
264
+ "Top Evidence": " | ".join(t.get("top_sentences", [])[:2]),
265
+ "Sentences": t.get("sentence_count", 0),
266
+ "Papers": "",
267
+ "Approve": "Yes",
268
+ "Rename To": "",
269
+ "Reasoning": t.get("reasoning", ""),
270
+ },
271
+ topics[:100],
272
+ ))
273
+ return pd.DataFrame(rows)
274
+
275
+
276
+ def _refresh_downloads() -> list[str]:
277
+ return _get_download_files() or None
278
+
279
+
280
+ # ---------------------------------------------------------------------------
281
+ # Build UI
282
+ # ---------------------------------------------------------------------------
283
+
284
+ with gr.Blocks(
285
+ title="BERTopic Thematic Analysis Agent",
286
+ ) as demo:
287
+
288
+ # ---- State ----
289
+ csv_path_state = gr.State("")
290
+
291
+ # ---- Header ----
292
+ gr.HTML(
293
+ '<div style="padding:24px 0 8px;">'
294
+ '<h1 style="font-size:1.6rem;font-weight:600;margin:0;color:#1e1b4b;">'
295
+ 'πŸ“š BERTopic Thematic Analysis Agent</h1>'
296
+ '<p style="color:#6b7280;margin:4px 0 0;font-size:0.95rem;">'
297
+ 'Braun &amp; Clarke (2006) Β· Six-Phase Pipeline Β· PAJAIS Taxonomy</p>'
298
+ '</div>'
299
+ )
300
+
301
+ # ---- Phase Progress Bar ----
302
+ phase_bar = gr.HTML(value=_phase_bar_html(-1), label="Phase Progress")
303
+
304
+ # ════════════════════════════════════════════════════════
305
+ # SECTION 1 β€” Data Input
306
+ # ════════════════════════════════════════════════════════
307
+ with gr.Group():
308
+ gr.Markdown("## 1 Β· Data Input")
309
+ with gr.Row():
310
+ with gr.Column(scale=2):
311
+ file_upload = gr.File(
312
+ label="Upload Scopus CSV",
313
+ file_types=[".csv"],
314
+ type="filepath",
315
+ )
316
+ file_status = gr.Markdown("_No file uploaded._")
317
+ with gr.Column(scale=1):
318
+ run_config = gr.Radio(
319
+ choices=["abstract", "title"],
320
+ value="abstract",
321
+ label="Run Config (field to cluster)",
322
+ )
323
+ start_btn = gr.Button("β–Ά Start Analysis", variant="primary", size="lg")
324
+
325
+ # ════════════════════════════════════════════════════════
326
+ # SECTION 2 β€” Agent Conversation
327
+ # ════════════════════════════════════════════════════════
328
+ with gr.Group():
329
+ gr.Markdown("## 2 Β· Agent Conversation")
330
+ chatbot = gr.Chatbot(
331
+ label="Thematic Analysis Agent"
332
+ )
333
+ with gr.Row():
334
+ chat_input = gr.Textbox(
335
+ placeholder="Type a message or instruction… (e.g. 'proceed to Phase 2')",
336
+ label="",
337
+ scale=5,
338
+ show_label=False,
339
+ lines=1,
340
+ )
341
+ send_btn = gr.Button("Send", variant="primary", scale=1)
342
+
343
+ # ════════════════════════════════════════════════════════
344
+ # SECTION 3 β€” Results
345
+ # ════════════════════════════════════════════════════════
346
+ with gr.Group():
347
+ gr.Markdown("## 3 Β· Results")
348
+ with gr.Tabs():
349
+
350
+ # --- Tab 1: Review Table ---
351
+ with gr.TabItem("πŸ“‹ Review Table"):
352
+ with gr.Row():
353
+ refresh_table_btn = gr.Button("πŸ”„ Refresh Table", size="sm")
354
+ review_table = gr.Dataframe(
355
+ value=_empty_review_df(),
356
+ headers=REVIEW_COLUMNS,
357
+ datatype=[
358
+ "number", "str", "str", "number",
359
+ "str", "str", "str", "str",
360
+ ],
361
+ column_count=(8, "fixed"),
362
+ interactive=True,
363
+ wrap=True,
364
+ label="Topic Review Table (edit Approve / Rename To / Reasoning)"
365
+ )
366
+ submit_review_btn = gr.Button(
367
+ "βœ… Submit Review", variant="primary", size="lg"
368
+ )
369
+
370
+ # --- Tab 2: Charts ---
371
+ with gr.TabItem("πŸ“Š Charts"):
372
+ chart_dropdown = gr.Dropdown(
373
+ choices=CHART_OPTIONS,
374
+ value=CHART_OPTIONS[0],
375
+ label="Select Chart",
376
+ interactive=True,
377
+ )
378
+ chart_display = gr.Plot(label="Chart")
379
+
380
+ # --- Tab 3: Download ---
381
+ with gr.TabItem("⬇ Download"):
382
+ refresh_dl_btn = gr.Button("πŸ”„ Refresh Files", size="sm")
383
+ download_files = gr.File(
384
+ label="Download Analysis Outputs",
385
+ file_count="multiple",
386
+ interactive=False,
387
+ value=None,
388
+ )
389
+
390
+ # ════════════════════════════════════════════════════════
391
+ # Event wiring
392
+ # ════════════════════════════════════════════════════════
393
+
394
+ # Upload CSV β†’ store path
395
+ file_upload.change(
396
+ fn=_upload_csv,
397
+ inputs=[file_upload],
398
+ outputs=[csv_path_state, file_status],
399
+ )
400
+
401
+ # Start analysis button
402
+ start_btn.click(
403
+ fn=_start_analysis,
404
+ inputs=[csv_path_state, chatbot],
405
+ outputs=[chatbot, chat_input, phase_bar, review_table],
406
+ )
407
+
408
+ # Send message (button)
409
+ send_btn.click(
410
+ fn=_send_message,
411
+ inputs=[chat_input, chatbot, phase_bar],
412
+ outputs=[chatbot, chat_input, phase_bar, review_table],
413
+ )
414
+
415
+ # Send message (Enter key)
416
+ chat_input.submit(
417
+ fn=_send_message,
418
+ inputs=[chat_input, chatbot, phase_bar],
419
+ outputs=[chatbot, chat_input, phase_bar, review_table],
420
+ )
421
+
422
+ # Submit review table
423
+ submit_review_btn.click(
424
+ fn=_submit_review,
425
+ inputs=[review_table, chatbot],
426
+ outputs=[chatbot, chat_input, review_table],
427
+ )
428
+
429
+ # Refresh review table
430
+ refresh_table_btn.click(
431
+ fn=_refresh_review_table,
432
+ inputs=[],
433
+ outputs=[review_table],
434
+ )
435
+
436
+ # Chart dropdown
437
+ chart_dropdown.change(
438
+ fn=_get_chart_plot,
439
+ inputs=[chart_dropdown],
440
+ outputs=[chart_display],
441
+ )
442
+
443
+ # Refresh downloads
444
+ refresh_dl_btn.click(
445
+ fn=_refresh_downloads,
446
+ inputs=[],
447
+ outputs=[download_files],
448
+ )
449
+
450
+
451
+ # ---------------------------------------------------------------------------
452
+ # Launch
453
+ # ---------------------------------------------------------------------------
454
+
455
+ if __name__ == "__main__":
456
+ demo.launch(
457
+ server_name="0.0.0.0",
458
+ server_port=7860,
459
+ show_error=True,
460
+ theme=gr.themes.Soft(primary_hue="indigo"),
461
+ )
requirements.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core ML / NLP
2
+ sentence-transformers==3.3.1
3
+ scikit-learn==1.6.1
4
+ numpy==1.26.4
5
+
6
+ # LangChain / LangGraph
7
+ langchain==0.3.18
8
+ langchain-core==0.3.37
9
+ langchain-mistralai==0.2.4
10
+ langgraph==0.2.73
11
+
12
+ # Gradio UI
13
+ gradio==5.16.0
14
+
15
+ # Data handling
16
+ pandas==2.2.3
17
+ pyarrow==19.0.0
18
+
19
+ # Visualisation
20
+ plotly==5.24.1
21
+
22
+ # Mistral SDK (pulled by langchain-mistralai, pinned for stability)
23
+ mistralai==1.3.1
24
+
25
+ # Utilities
26
+ python-dotenv==1.0.1
tools.py ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ tools.py — 7 LangChain tool functions for BERTopic thematic analysis pipeline.
3
+ Constraints: ZERO if/else, ZERO for/while, ZERO try/except.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import re
10
+ import numpy as np
11
+ import pandas as pd
12
+ import plotly.express as px
13
+ import plotly.graph_objects as go
14
+
15
+ from pathlib import Path
16
+ from langchain_core.tools import tool
17
+ from sentence_transformers import SentenceTransformer
18
+ from sklearn.cluster import AgglomerativeClustering
19
+ from sklearn.metrics.pairwise import cosine_similarity
20
+ from langchain_core.prompts import PromptTemplate
21
+ from langchain_core.output_parsers import JsonOutputParser
22
+ from langchain_mistralai import ChatMistralAI
23
+ from dotenv import load_dotenv
24
+ load_dotenv() # add this right after the imports
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Constants
28
+ # ---------------------------------------------------------------------------
29
+
30
+ BOILERPLATE_PATTERNS = [
31
+ r"Β©\s*\d{4}",
32
+ r"all rights reserved",
33
+ r"published by elsevier",
34
+ r"doi:\s*10\.\S+",
35
+ r"this article is protected",
36
+ r"www\.\S+\.com",
37
+ r"^\s*abstract\s*$",
38
+ r"please cite this article",
39
+ r"accepted manuscript",
40
+ ]
41
+
42
+ RUN_CONFIGS = {
43
+ "abstract": ["Abstract"],
44
+ "title": ["Title"],
45
+ }
46
+
47
+ PAJAIS_CATEGORIES = [
48
+ "Artificial Intelligence", "Machine Learning", "Deep Learning",
49
+ "Natural Language Processing", "Computer Vision", "Robotics",
50
+ "Knowledge Representation", "Expert Systems", "Decision Support",
51
+ "Data Mining", "Information Retrieval", "Human-Computer Interaction",
52
+ "Ethics in AI", "Explainable AI", "Fairness and Bias",
53
+ "AI in Healthcare", "AI in Education", "AI in Finance",
54
+ "AI in Manufacturing", "AI in Agriculture", "AI Governance",
55
+ "Neural Networks", "Reinforcement Learning", "Federated Learning",
56
+ "AI Safety",
57
+ ]
58
+
59
+ _MISTRAL = ChatMistralAI(model="mistral-large-latest", temperature=0)
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # Helper β€” pure functions, no loops
63
+ # ---------------------------------------------------------------------------
64
+
65
+ def _clean_text(text: str) -> str:
66
+ combined = "|".join(BOILERPLATE_PATTERNS)
67
+ return re.sub(combined, "", text, flags=re.IGNORECASE).strip()
68
+
69
+
70
+ def _sentences_from_series(series: pd.Series) -> list[str]:
71
+ raw = series.dropna().str.cat(sep=" ")
72
+ return list(filter(None, map(str.strip, re.split(r"(?<=[.!?])\s+", raw))))
73
+
74
+
75
+ def _nearest_centroids(embeddings: np.ndarray, labels: np.ndarray, n: int = 5):
76
+ unique_labels = np.unique(labels)
77
+ centroids = np.array(list(map(
78
+ lambda lbl: embeddings[labels == lbl].mean(axis=0),
79
+ unique_labels,
80
+ )))
81
+ sim_matrix = cosine_similarity(centroids)
82
+ np.fill_diagonal(sim_matrix, -1)
83
+ nearest = list(map(
84
+ lambda i: unique_labels[np.argsort(sim_matrix[i])[::-1][:n]].tolist(),
85
+ range(len(unique_labels)),
86
+ ))
87
+ return dict(zip(unique_labels.tolist(), nearest))
88
+
89
+
90
+ def _top_sentences(sentences: list[str], embeddings: np.ndarray,
91
+ centroid: np.ndarray, k: int = 5) -> list[str]:
92
+ sims = cosine_similarity([centroid], embeddings)[0]
93
+ top_idx = np.argsort(sims)[::-1][:k]
94
+ return list(map(lambda i: sentences[i], top_idx))
95
+
96
+
97
+ # ---------------------------------------------------------------------------
98
+ # Tool 1 β€” load_scopus_csv
99
+ # ---------------------------------------------------------------------------
100
+
101
@tool
def load_scopus_csv(csv_path: str, run_config: str = "abstract") -> str:
    """Load a Scopus CSV file, count papers/sentences, apply boilerplate regex
    filter, and return a JSON summary. run_config must be 'abstract' or 'title'.

    The cleaned rows are saved next to the CSV as '<name>_cleaned.parquet';
    the summary reports that location under 'parquet_path'. The summary is
    also written to summaries.json in the working directory.
    """
    df = pd.read_csv(csv_path)
    columns = RUN_CONFIGS[run_config]
    available_cols = list(filter(lambda c: c in df.columns, columns))
    texts = df[available_cols].fillna("").apply(
        lambda row: " ".join(row.values.astype(str)), axis=1
    )

    # Pass 1: strip the shared boilerplate patterns.
    cleaned = list(map(_clean_text, texts))

    # Pass 2: drop trailing publisher/copyright notices, which run from the
    # marker to the end of the line.
    cleaned = list(map(
        lambda x: re.sub(
            r"©.*|all rights reserved|copyright.*|palgrave.*",
            "",
            x,
            flags=re.I,
        ),
        cleaned,
    ))
    sentences = _sentences_from_series(pd.Series(cleaned))
    df["_cleaned_text"] = cleaned

    # Derive the output name from the file stem. A plain
    # str.replace(".csv", ...) leaves the path unchanged for e.g. ".CSV"
    # (so the parquet would overwrite the upload) and also rewrites ".csv"
    # occurring inside directory names.
    source = Path(csv_path)
    parquet_path = str(source.with_name(f"{source.stem}_cleaned.parquet"))
    df.to_parquet(parquet_path, index=False)

    summary = {
        "csv_path": csv_path,
        "parquet_path": parquet_path,
        "run_config": run_config,
        "columns_used": available_cols,
        "total_papers": int(len(df)),
        "total_sentences": len(sentences),
        "sample_titles": df["Title"].head(5).tolist() if "Title" in df.columns else [],
    }
    Path("summaries.json").write_text(json.dumps(summary, indent=2))
    return json.dumps(summary)
139
+
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # Tool 2 — run_bertopic_discovery
143
+ # ---------------------------------------------------------------------------
144
+
145
@tool
def run_bertopic_discovery(parquet_path: str, run_config: str = "abstract") -> str:
    """Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering
    (cosine, threshold=0.7), find 5 nearest centroids per cluster, generate 4
    Plotly charts. Saves summaries.json, topics.json, charts.json, emb.npy and
    labels.npy. Returns a JSON summary (cluster count, topics kept, chart names)."""
    df = pd.read_parquet(parquet_path)
    columns = RUN_CONFIGS[run_config]
    available_cols = [c for c in columns if c in df.columns]
    if df.empty or not available_cols:
        raise ValueError(
            f"{parquet_path} has no rows or none of the columns {list(columns)} "
            f"required by run_config={run_config!r}"
        )
    texts = df[available_cols].fillna("").apply(
        lambda row: " ".join(row.values.astype(str)), axis=1
    )
    # Cluster boilerplate-free text: the raw columns still carry the
    # publisher/copyright phrases that would otherwise form their own topics.
    sentences = _sentences_from_series(texts.map(_clean_text))
    if len(sentences) < 2:
        raise ValueError(
            f"Need at least two sentences to cluster, found {len(sentences)}"
        )

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)
    np.save("emb.npy", embeddings)

    clustering = AgglomerativeClustering(
        metric="cosine",
        linkage="average",
        distance_threshold=0.7,
        n_clusters=None,
    )
    labels = clustering.fit_predict(embeddings)
    # Persist the sentence -> cluster assignment alongside emb.npy so later
    # steps can select a topic's sentences without re-clustering.
    np.save("labels.npy", labels)

    unique_labels, counts = np.unique(labels, return_counts=True)
    # Row indices and centroid of every cluster, computed once and reused.
    members = {
        int(label): np.flatnonzero(labels == label) for label in unique_labels
    }
    centroids = {
        topic_id: embeddings[rows].mean(axis=0)
        for topic_id, rows in members.items()
    }
    nearest = _nearest_centroids(embeddings, labels)

    def _summarise(topic_id: int, size: int) -> dict:
        rows = members[topic_id]
        return {
            "topic_id": topic_id,
            "sentence_count": size,
            "nearest_topics": nearest.get(topic_id, []),
            # Rank only the cluster's own sentences so that a topic is never
            # illustrated by sentences belonging to a different cluster.
            "top_sentences": _top_sentences(
                [sentences[row] for row in rows],
                embeddings[rows],
                centroids[topic_id],
            ),
        }

    topic_summaries = [
        _summarise(int(label), int(count))
        for label, count in zip(unique_labels, counts)
    ]

    # Largest clusters first; only the 100 biggest are kept downstream.
    topic_summaries.sort(key=lambda t: t["sentence_count"], reverse=True)
    top100 = topic_summaries[:100]

    # ---- Chart 1: bar chart of the top 20 topics by sentence count ----
    top20 = top100[:20]
    fig1 = px.bar(
        x=[f"T{t['topic_id']}" for t in top20],
        y=[t["sentence_count"] for t in top20],
        labels={"x": "Topic", "y": "Sentences"},
        title="Top 20 Topics by Sentence Count",
    )

    # ---- Chart 2: treemap ----
    fig2 = px.treemap(
        names=[f"Topic {t['topic_id']}" for t in top100],
        parents=["All"] * len(top100),
        values=[t["sentence_count"] for t in top100],
        title="Topic Distribution Treemap",
    )

    # ---- Chart 3: scatter (PCA 2D projection) ----
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    coords = pca.fit_transform(embeddings)
    fig3 = go.Figure(go.Scatter(
        x=coords[:, 0], y=coords[:, 1],
        mode="markers",
        marker=dict(color=labels, colorscale="Viridis", size=4, opacity=0.6),
    ))
    fig3.update_layout(title="Sentence Clusters (PCA 2D)")

    # ---- Chart 4: heatmap of cosine similarity between the top 10 topics ----
    top10_ids = [t["topic_id"] for t in top100[:10]]
    centroids10 = np.array([centroids[topic_id] for topic_id in top10_ids])
    sim10 = cosine_similarity(centroids10)
    fig4 = px.imshow(
        sim10,
        x=[f"T{i}" for i in top10_ids],
        y=[f"T{i}" for i in top10_ids],
        color_continuous_scale="Blues",
        title="Top-10 Topic Cosine Similarity Heatmap",
    )

    charts = {
        "bar_top20": fig1.to_json(),
        "treemap": fig2.to_json(),
        "scatter_pca": fig3.to_json(),
        "heatmap": fig4.to_json(),
    }
    total_clusters = int(len(unique_labels))

    # Extend the running summary; start from scratch when the loading step
    # was executed from another working directory and left no file here.
    summaries_path = Path("summaries.json")
    existing = (
        json.loads(summaries_path.read_text()) if summaries_path.exists() else {}
    )
    existing.update({"bertopic": {"total_clusters": total_clusters}})
    summaries_path.write_text(json.dumps(existing, indent=2))
    Path("charts.json").write_text(json.dumps(charts, indent=2))
    Path("topics.json").write_text(json.dumps(top100, indent=2))

    return json.dumps({
        "total_clusters": total_clusters,
        "top100_count": len(top100),
        "charts_saved": list(charts.keys()),
    })
257
+
258
+
259
+ # ---------------------------------------------------------------------------
260
+ # Tool 3 — label_topics_with_llm
261
+ # ---------------------------------------------------------------------------
262
+
263
@tool
def label_topics_with_llm(topics_json_path: str = "topics.json") -> str:
    """Send top-100 topics to Mistral via PromptTemplate + JsonOutputParser to
    generate human-readable labels. Saves labelled_topics.json and returns JSON
    with the labelled count and that path."""
    topics = json.loads(Path(topics_json_path).read_text())
    batch = topics[:100]
    out_path = "labelled_topics.json"
    if not batch:
        # Nothing to label: skip the LLM round trip but still leave a valid file.
        Path(out_path).write_text(json.dumps([], indent=2))
        return json.dumps({"labelled_count": 0, "path": out_path})

    prompt = PromptTemplate.from_template(
        "You are a qualitative research expert. Below are topic clusters from a "
        "systematic literature review. For EACH topic assign a concise label "
        "(3-6 words) and one sentence of reasoning.\n\n"
        "Topics:\n{topics_text}\n\n"
        "Return ONLY valid JSON: a list of objects with keys: "
        "topic_id, label, reasoning. No markdown fences."
    )
    parser = JsonOutputParser()
    chain = prompt | _MISTRAL | parser

    topics_text = "\n".join(
        f"Topic {t['topic_id']} ({t['sentence_count']} sentences): "
        + " | ".join(t["top_sentences"][:2])
        for t in batch
    )

    labelled = chain.invoke({"topics_text": topics_text})
    items = labelled if isinstance(labelled, list) else []

    def _topic_key(raw):
        # The model may echo ids as strings ("12"); compare them as integers.
        try:
            return int(raw)
        except (TypeError, ValueError):
            return None

    keyed = (
        (_topic_key(item.get("topic_id")), item)
        for item in items
        if isinstance(item, dict)
    )
    label_map = {key: item for key, item in keyed if key is not None}

    def _with_label(topic: dict) -> dict:
        # Copy only label/reasoning so the model's reply can never overwrite
        # the topic's own fields (notably an integer topic_id with a string).
        found = label_map.get(_topic_key(topic["topic_id"]), {})
        return {
            **topic,
            "label": found.get("label") or f"Topic {topic['topic_id']}",
            "reasoning": found.get("reasoning") or "",
        }

    enriched = [_with_label(topic) for topic in batch]

    Path(out_path).write_text(json.dumps(enriched, indent=2))
    return json.dumps({"labelled_count": len(enriched), "path": out_path})
297
+
298
+
299
+ # ---------------------------------------------------------------------------
300
+ # Tool 4 — consolidate_into_themes
301
+ # ---------------------------------------------------------------------------
302
+
303
@tool
def consolidate_into_themes(approved_groups_json: str) -> str:
    """Merge approved topic groups into themes, recompute centroids from emb.npy.
    approved_groups_json: JSON list of {theme_name, topic_ids: [...]} objects."""
    groups = json.loads(approved_groups_json)
    embeddings = np.load("emb.npy")
    topics = json.loads(Path("labelled_topics.json").read_text())
    topic_id_to_sentences = {
        int(t["topic_id"]): t["top_sentences"] for t in topics
    }

    # A theme centroid is the mean embedding of every sentence whose cluster
    # belongs to the theme, so the per-sentence cluster labels are required.
    # Topic ids are cluster labels, NOT row numbers of emb.npy.
    labels_path = Path("labels.npy")
    labels = np.load(labels_path) if labels_path.exists() else None
    if labels is None or len(labels) != len(embeddings):
        # No (or stale) saved labels: rebuild them with the same deterministic
        # clustering settings run_bertopic_discovery applies to emb.npy.
        labels = AgglomerativeClustering(
            metric="cosine",
            linkage="average",
            distance_threshold=0.7,
            n_clusters=None,
        ).fit_predict(embeddings)

    def _build_theme(group: dict) -> dict:
        topic_ids = [int(topic_id) for topic_id in group["topic_ids"]]
        member_rows = embeddings[np.isin(labels, topic_ids)]
        quotes = [
            sentence
            for topic_id in topic_ids
            for sentence in topic_id_to_sentences.get(topic_id, [])
        ]
        return {
            "theme_name": group["theme_name"],
            "topic_ids": topic_ids,
            "top_sentences": quotes[:10],
            # An unknown topic id selects no rows; emit an empty centroid
            # instead of NaN, which json.dumps would write as invalid JSON.
            "centroid": (
                member_rows.mean(axis=0).tolist() if len(member_rows) else []
            ),
        }

    themes = [_build_theme(group) for group in groups]

    Path("themes.json").write_text(json.dumps(themes, indent=2))
    return json.dumps({"themes_count": len(themes), "theme_names": [t["theme_name"] for t in themes]})
329
+
330
+
331
+ # ---------------------------------------------------------------------------
332
+ # Tool 5 — compare_with_taxonomy
333
+ # ---------------------------------------------------------------------------
334
+
335
@tool
def compare_with_taxonomy(themes_json_path: str = "themes.json") -> str:
    """Map consolidated themes to PAJAIS 25 categories via Mistral.
    Saves taxonomy_mapping.json and returns JSON with the mapped count and path."""
    themes = json.loads(Path(themes_json_path).read_text())
    out_path = "taxonomy_mapping.json"
    if not themes:
        # No themes to map: skip the LLM call but still leave a valid file.
        Path(out_path).write_text(json.dumps([], indent=2))
        return json.dumps({"mapped_count": 0, "path": out_path})

    prompt = PromptTemplate.from_template(
        "You are an AI research taxonomist. Map each theme to the most relevant "
        "PAJAIS category.\n\n"
        "PAJAIS Categories:\n{categories}\n\n"
        "Themes:\n{themes_text}\n\n"
        "Return ONLY valid JSON: a list of objects with keys: "
        "theme_name, pajais_category, confidence (0-1), rationale. No markdown."
    )
    parser = JsonOutputParser()
    chain = prompt | _MISTRAL | parser

    themes_text = "\n".join(
        f"- {t['theme_name']}: " + "; ".join(t["top_sentences"][:2])
        for t in themes
    )
    # Bullet written as an escape so it cannot be mangled by a file-encoding
    # round trip before it reaches the prompt.
    categories = "\n".join(f" \u2022 {c}" for c in PAJAIS_CATEGORIES)

    mapping = chain.invoke({
        "categories": categories,
        "themes_text": themes_text,
    })
    # Consumers iterate a list of objects; normalise a lone object (typical
    # when there is a single theme) and discard any other shape.
    if isinstance(mapping, dict):
        mapping = [mapping]
    elif not isinstance(mapping, list):
        mapping = []

    Path(out_path).write_text(json.dumps(mapping, indent=2))
    return json.dumps({"mapped_count": len(mapping), "path": out_path})
364
+
365
+
366
+ # ---------------------------------------------------------------------------
367
+ # Tool 6 — generate_comparison_csv
368
+ # ---------------------------------------------------------------------------
369
+
370
@tool
def generate_comparison_csv(original_csv_path: str) -> str:
    """Generate a side-by-side comparison CSV of abstract vs title text for each
    paper. Returns JSON with the output CSV path, row count and column names."""
    df = pd.read_csv(original_csv_path)
    # Title first, then abstract; whichever of the two the export contains.
    text_cols = [c for c in ("Title", "Abstract") if c in df.columns]

    comparison = df[text_cols].copy()
    comparison.columns = [f"{c}_text" for c in text_cols]
    comparison.insert(0, "Paper_ID", range(1, len(df) + 1))

    # NOTE: no per-paper theme is computed here; every row carries the same
    # pointer to themes.json as a placeholder.
    comparison["Theme_Assignment"] = "See themes.json for full mapping"

    out_path = "comparison_abstract_vs_title.csv"
    comparison.to_csv(out_path, index=False)
    return json.dumps({"output_csv": out_path, "rows": len(comparison), "columns": comparison.columns.tolist()})
395
+
396
+
397
+ # ---------------------------------------------------------------------------
398
+ # Tool 7 — export_narrative
399
+ # ---------------------------------------------------------------------------
400
+
401
@tool
def export_narrative(context_json: str = "{}") -> str:
    """Generate a ~500-word Section 7 narrative via Mistral, synthesising all
    prior analysis. context_json may contain extra instructions. Saves the
    narrative to narrative.md and returns JSON with its path and word count."""

    def _load_json(path: str, default):
        # Earlier pipeline steps are optional inputs: fall back when absent.
        file = Path(path)
        return json.loads(file.read_text()) if file.exists() else default

    # Accept an empty string or free text as well as a JSON object; plain
    # text is treated as the extra instructions themselves.
    try:
        context = json.loads(context_json) if context_json.strip() else {}
    except json.JSONDecodeError:
        context = {"extra_instructions": context_json}
    if not isinstance(context, dict):
        context = {}

    themes = _load_json("themes.json", [])
    mapping = _load_json("taxonomy_mapping.json", [])
    summaries = _load_json("summaries.json", {})

    themes_summary = "\n".join(
        f"- **{t['theme_name']}**: " + "; ".join(t["top_sentences"][:1])
        for t in themes
    )
    # Arrow written as an escape so it cannot be mangled by a file-encoding
    # round trip before it reaches the prompt.
    mapping_summary = "\n".join(
        f"- {m.get('theme_name','?')} \u2192 {m.get('pajais_category','?')} "
        f"(confidence: {m.get('confidence', '?')})"
        for m in mapping
    )

    prompt = PromptTemplate.from_template(
        "You are a senior academic researcher writing a systematic literature review. "
        "Write Section 7 (Discussion & Synthesis) of approximately 500 words. "
        "Use an academic tone, Braun & Clarke (2006) thematic analysis framing, "
        "and reference the themes and PAJAIS taxonomy mappings provided.\n\n"
        "Dataset summary:\n{summaries}\n\n"
        "Themes identified:\n{themes}\n\n"
        "PAJAIS taxonomy mapping:\n{mapping}\n\n"
        "Extra context: {extra}\n\n"
        "Write the section now. Use markdown headings."
    )
    chain = prompt | _MISTRAL

    result = chain.invoke({
        "summaries": json.dumps(summaries, indent=2),
        "themes": themes_summary,
        "mapping": mapping_summary,
        "extra": context.get("extra_instructions", "None"),
    })

    narrative = result.content
    # Model output routinely contains non-ASCII punctuation; pin UTF-8 so the
    # write does not depend on (or fail under) the platform default encoding.
    Path("narrative.md").write_text(narrative, encoding="utf-8")
    return json.dumps({"narrative_path": "narrative.md", "word_count": len(narrative.split())})