Spaces:
Sleeping
Sleeping
File size: 22,368 Bytes
ccab3d4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 | """
agent.py — LangGraph ReAct Agent for BERTopic Agentic Thematic Analysis
Uses ChatMistralAI + MemorySaver + all 7 tools from tools.py
"""
import json
import os
import re
import pandas as pd
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_mistralai import ChatMistralAI
from tools import (
load_scopus_csv,
run_bertopic_discovery,
label_topics_with_llm,
consolidate_into_themes,
compare_with_taxonomy,
generate_comparison_csv,
export_narrative,
)
# Chat model driving the agent. The API key is read from the MISTRAL_API_KEY
# environment variable and falls back to an empty string when it is unset.
llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0.2,
    api_key=os.environ.get("MISTRAL_API_KEY", ""),
)
# MemorySaver checkpointer handed to the agent below (checkpointer=memory) and
# queried directly by get_agent_state().
memory = MemorySaver()
# System prompt handed to create_react_agent (prompt=SYSTEM_PROMPT). It lists
# the seven tools, the phase-by-phase workflow with its STOP gates, and the
# [PHASE_PROGRESS: ...] tag that _parse_phase_progress() later extracts from
# the model's replies. Keep the tag format and the [REVIEW_TABLE_SUBMITTED]
# marker in sync with the parsing code in this module.
SYSTEM_PROMPT = """
You are an expert computational thematic analysis agent. You follow Braun & Clarke (2006)
six-phase thematic analysis methodology, adapted for computational corpus analysis using
BERTopic with sentence-transformer embeddings and agglomerative clustering.
1. load_scopus_csv(file_path: str)
→ Load the CSV. Count papers, abstract sentences, title sentences.
→ Strip boilerplate text from abstracts.
→ Saves cleaned_data.json to outputs/.
→ Input: absolute file path string.
2. run_bertopic_discovery(run_config: str)
→ Embeds sentences using all-MiniLM-L6-v2.
→ Clusters with AgglomerativeClustering (cosine, threshold=0.7).
→ Extracts 5 nearest evidence sentences per cluster.
→ Saves summaries_{tag}.json, embeddings_{tag}.npy, and 2 chart HTML files.
→ Input JSON: {"columns": ["Abstract"]} or {"columns": ["Title"]}
→ Run TWICE: once for Abstract (tag=abstract), once for Title (tag=title).
3. label_topics_with_llm(labelling_input: str)
→ You (the LLM) read the top_sentences for each cluster from summaries_{tag}.json,
then SELF-SUPPLY the llm_labels list with your best label, category,
confidence (0–1), and reasoning for each cluster.
→ Input JSON: {
"tag": "abstract",
"llm_labels": [
{"cluster_id": 0, "label": "AI in Healthcare", "category": "Applied AI",
"confidence": 0.92, "reasoning": "Sentences discuss medical diagnostics..."},
...
]
}
4. consolidate_into_themes(consolidation_input: str)
→ Applies user approvals from the Review Table.
→ Merges approved clusters into final themes with final labels.
→ Saves themes_{tag}.json and chart_keywords.html.
→ Input JSON: {
"tag": "abstract",
"approvals": [
{"cluster_id": 0, "approved": true, "rename_to": "AI in Medicine",
"reasoning": "Covers core domain"},
...
]
}
5. compare_with_taxonomy(taxonomy_input: str)
→ Maps each final theme to the PAJAIS taxonomy.
→ Marks each theme as MAPPED or NOVEL.
→ You self-supply the mappings list.
→ Input JSON: {
"tag": "abstract",
"mappings": [
{"final_label": "AI in Medicine", "pajais_category": "Healthcare IS",
"mapped": true},
...
]
}
6. generate_comparison_csv(comparison_input: str)
→ Generates side-by-side CSV and Plotly chart comparing abstract vs title themes.
→ Input JSON: {"tags": ["abstract", "title"]}
7. export_narrative(narrative_input: str)
→ You write the ~500-word Section 7 narrative yourself.
→ Input JSON: {
"tag": "abstract",
"narrative": "...(your 500-word narrative here)...",
"researcher_name": "..."
}
════════════════════════════════════════════════════════════════
RUN CONFIGURATIONS
════════════════════════════════════════════════════════════════
• Abstract run: columns = ["Abstract"] → tag = "abstract"
• Title run: columns = ["Title"] → tag = "title"
Always run BERTopic for BOTH configurations before Phase 3.
════════════════════════════════════════════════════════════════
BRAUN & CLARKE 6-PHASE WORKFLOW
════════════════════════════════════════════════════════════════
PHASE 1 — FAMILIARISATION
Goal: Understand the dataset.
Action:
1. Call load_scopus_csv(file_path) with the uploaded file path.
2. Report: total papers, abstract sentences, title sentences, column list.
3. Show 5 sample titles.
STOP after Phase 1. Say:
"✅ Phase 1 complete. Familiarisation done. Say 'Start Phase 2' to begin coding."
──────────────────────────────────────────────────────────────
PHASE 2 — INITIAL CODING
Goal: Generate initial semantic codes (clusters) from the corpus.
Actions:
1. Call run_bertopic_discovery({"columns": ["Abstract"]})
2. Call run_bertopic_discovery({"columns": ["Title"]})
3. Read outputs/summaries_abstract.json → list ALL cluster IDs and their top 2 sentences.
4. Analyse each cluster's top_sentences yourself.
5. Call label_topics_with_llm with your self-generated labels for the ABSTRACT run.
6. Call label_topics_with_llm with your self-generated labels for the TITLE run.
7. Build and present a REVIEW TABLE for the user (for abstract clusters):
Columns: [#, Topic Label, Top Evidence, Sentences, Papers, Approve, Rename To, Reasoning]
Fill Approve=True for confident clusters, Approve=False for weak/duplicate ones.
*** STOP GATE AFTER PHASE 2 ***
Say: "⏸️ STOP — Phase 2 complete. Review the table above.
Edit Approve/Rename To/Reasoning columns, then click Submit Review to proceed to Phase 3."
──────────────────────────────────────────────────────────────
PHASE 3 — SEARCHING FOR THEMES
Goal: Group related codes into broader themes.
Trigger: User submits the review table (message begins with [REVIEW_TABLE_SUBMITTED]).
Actions:
1. Parse the JSON review table from the user's message.
2. Call consolidate_into_themes with the parsed approvals for "abstract".
3. Call consolidate_into_themes with approvals for "title" (approve all by default).
4. Report the final theme list with counts.
*** STOP GATE AFTER PHASE 3 ***
Say: "⏸️ STOP — Phase 3 complete. [N] themes consolidated.
Review the theme list above. Say 'Proceed to Phase 4' when satisfied."
──────────────────────────────────────────────────────────────
PHASE 4 — REVIEWING THEMES
Goal: Theoretical saturation check.
Actions:
1. Analyse theme sizes and sentence counts.
2. Flag any theme with fewer than 3 sentences as POTENTIALLY WEAK.
3. Flag any two themes sharing >60% of their top keywords as POTENTIALLY OVERLAPPING.
4. Report saturation status: SATURATED or REQUIRES REVISION.
5. Recommend merges or splits if needed.
*** STOP GATE AFTER PHASE 4 ***
Say: "⏸️ STOP — Phase 4 complete. Saturation analysis done.
Say 'Proceed to Phase 5' to finalise theme names."
──────────────────────────────────────────────────────────────
PHASE 5 — DEFINING AND NAMING THEMES
Goal: Finalize descriptive theme names and definitions.
Actions:
1. For each theme, write a 1-sentence definition.
2. Present final theme names and definitions in a clean table.
3. Confirm with user.
(No STOP gate — flows directly into Phase 5.5)
──────────────────────────────────────────────────────────────
PHASE 5.5 — PAJAIS TAXONOMY MAPPING
Goal: Position themes within the IS research landscape.
Actions:
1. Call compare_with_taxonomy for the abstract run — self-supply your mappings.
2. Call compare_with_taxonomy for the title run — self-supply your mappings.
3. Present a table: Theme | PAJAIS Category | Status (MAPPED/NOVEL).
*** STOP GATE AFTER PHASE 5.5 ***
Say: "⏸️ STOP — Phase 5.5 complete. PAJAIS mapping done.
Say 'Generate Final Report' to proceed to Phase 6."
──────────────────────────────────────────────────────────────
PHASE 6 — WRITING UP (REPORT)
Goal: Generate the final deliverables.
Actions:
1. Call generate_comparison_csv({"tags": ["abstract", "title"]})
2. Write a ~500-word academic narrative (Section 7) covering:
- Research context
- Summary of each theme with evidence
- Comparison of abstract vs title themes
- PAJAIS taxonomy positioning
- Implications for IS research
3. Call export_narrative with your narrative text.
4. Tell the user: outputs are in the outputs/ folder, click Refresh Downloads.
════════════════════════════════════════════════════════════════
STRICT BEHAVIOURAL RULES
════════════════════════════════════════════════════════════════
• ONE PHASE PER MESSAGE. Never jump ahead.
• At each STOP gate, wait for explicit user confirmation before proceeding.
• Never skip a phase.
• Always self-supply data for label_topics_with_llm, compare_with_taxonomy,
and export_narrative — do not ask the user for these.
• When the user submits a review table ([REVIEW_TABLE_SUBMITTED]), parse it
and call consolidate_into_themes immediately.
• Be concise. Avoid repeating instructions.
• If a tool returns an error, report it clearly and ask the user how to proceed.
• Keep all intermediate files in the outputs/ directory.
════════════════════════════════════════════════════════════════
PHASE PROGRESS HTML FORMAT
════════════════════════════════════════════════════════════════
After completing each phase, include in your response:
[PHASE_PROGRESS: P1=done, P2=done, P3=pending, P4=pending, P5=pending, P5.5=pending, P6=pending]
(Replace 'done'/'pending' accurately for the current state.)
"""
# ─── Agent ────────────────────────────────────────────────────────────────────
# The seven tool callables imported from tools.py, in workflow order.
tools_list = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]
# Agent wiring the chat model, the tools, the checkpointer and the system
# prompt together; run_agent() invokes it with a per-thread config.
agent = create_react_agent(
    model=llm,
    tools=tools_list,
    checkpointer=memory,
    prompt=SYSTEM_PROMPT,
)
# ─── Helpers for app.py ───────────────────────────────────────────────────────
def _parse_phase_progress(text: str) -> str:
"""Extract PHASE_PROGRESS tag from agent response and render as HTML."""
match = re.search(r"\[PHASE_PROGRESS:(.*?)\]", text, re.DOTALL)
status_map = {
"done": ("β
", "#22c55e"),
"pending": ("β¬", "#94a3b8"),
"active": ("π", "#3b82f6"),
}
labels = ["P1", "P2", "P3", "P4", "P5", "P5.5", "P6"]
if not match:
return "<div style='padding:10px;background:#f0f4ff;border-radius:8px'>" \
"<b>Phase Progress:</b> " + \
" ".join(f"<span style='margin-left:8px'>β¬ {l}</span>" for l in labels) + \
"</div>"
progress_str = match.group(1)
state = {}
for part in progress_str.split(","):
part = part.strip()
kv = part.split("=")
if len(kv) == 2:
state[kv[0].strip()] = kv[1].strip()
def _badge(label):
s = state.get(label, "pending")
icon, color = status_map.get(s, ("β¬", "#94a3b8"))
return (f"<span style='margin-left:8px;color:{color};font-weight:600'>"
f"{icon} {label}</span>")
badges = "".join(map(_badge, labels))
clean = re.sub(r"\[PHASE_PROGRESS:.*?\]", "", text, flags=re.DOTALL).strip()
return (
"<div style='padding:10px;background:#f0f4ff;border-radius:8px;"
"font-family:sans-serif'>"
f"<b>Phase Progress:</b>{badges}</div>",
clean
)
def _build_review_table(agent_text: str) -> list:
"""
Parse a markdown table from the agent response into a list of dicts
for the Gradio Dataframe review table.
"""
lines = agent_text.splitlines()
# Find markdown table header line (starts with '|' and contains # and Topic)
header_idx = None
for i, ln in enumerate(lines):
if ln.strip().startswith("|") and ("#" in ln) and ("Topic" in ln or "Topic Label" in ln):
header_idx = i
break
if header_idx is None:
# Fallback: TSV / whitespace-delimited
lines = agent_text.strip().splitlines()
header_idx = None
for i, ln in enumerate(lines):
if ("#" in ln) and ("Topic" in ln or "Topic Label" in ln):
header_idx = i
break
if header_idx is None:
return []
header_cells = re.split(r"\t| {2,}", lines[header_idx].strip())
data_lines = lines[header_idx+1:]
else:
# header exists as markdown table; collect following '|' rows
header_cells = [c.strip() for c in lines[header_idx].strip().strip("|").split("|")]
data_lines = []
# skip possible separator row like |---|
j = header_idx + 1
if j < len(lines) and re.match(r"^\|[-\s:|]+\|$", lines[j].strip()):
j += 1
while j < len(lines) and lines[j].strip().startswith("|"):
data_lines.append(lines[j])
j += 1
# Map header indices
header_map = {}
for idx, h in enumerate(header_cells):
key = h.lower()
if "#" in key:
header_map["#"] = idx
elif "cluster" in key and "id" in key:
header_map["Cluster ID"] = idx
elif "topic" in key and "label" in key:
header_map["Topic Label"] = idx
elif "evidence" in key:
header_map["Top Evidence"] = idx
elif "sentence" in key:
header_map["Sentences"] = idx
elif "paper" in key:
header_map["Papers"] = idx
elif "approve" in key:
header_map["Approve"] = idx
elif "rename" in key:
header_map["Rename To"] = idx
elif "reason" in key:
header_map["Reasoning"] = idx
rows = []
for ln in data_lines:
cells = [c.strip() for c in ln.strip().strip("|").split("|")] if ln.strip().startswith("|") else re.split(r"\t| {2,}", ln.strip())
if len(cells) < 2:
continue
row = {"#": "", "Topic Label": "", "Top Evidence": "", "Sentences": "", "Papers": "", "Approve": False, "Rename To": "", "Reasoning": ""}
def safe_get(idx):
try:
return cells[idx]
except Exception:
return ""
if "#" in header_map:
row["#"] = safe_get(header_map["#"]) or safe_get(0)
if "Cluster ID" in header_map:
row["Cluster ID"] = safe_get(header_map["Cluster ID"]) or ""
if "Topic Label" in header_map:
row["Topic Label"] = safe_get(header_map["Topic Label"]) or safe_get(1)
if "Top Evidence" in header_map:
row["Top Evidence"] = safe_get(header_map["Top Evidence"]) or ""
if "Sentences" in header_map:
row["Sentences"] = safe_get(header_map["Sentences"]) or ""
if "Papers" in header_map:
row["Papers"] = safe_get(header_map["Papers"]) or ""
if "Approve" in header_map:
val = safe_get(header_map["Approve"]).lower()
row["Approve"] = val in ("true","yes","β
","1","y","approve")
if "Rename To" in header_map:
row["Rename To"] = safe_get(header_map["Rename To"]) or ""
if "Reasoning" in header_map:
row["Reasoning"] = safe_get(header_map["Reasoning"]) or ""
rows.append(row)
return rows
raw_rows = table_pattern.group(2).strip().splitlines()
rows = []
def _parse_row(line):
cells = list(map(str.strip, line.strip("|").split("|")))
if len(cells) >= 8:
return {
"#": cells[0],
"Topic Label": cells[1],
"Top Evidence": cells[2],
"Sentences": cells[3],
"Papers": cells[4],
"Approve": cells[5].lower() in ("true", "yes", "β
", "1"),
"Rename To": cells[6],
"Reasoning": cells[7],
}
return None
parsed = list(map(_parse_row, raw_rows))
cleaned = list(filter(lambda r: r is not None, parsed))
return cleaned
def get_agent_state(thread_id: str) -> dict:
    """Look up what the checkpointer holds for ``thread_id``.

    Returns whatever ``memory.get`` yields for that thread, or an empty dict
    when the lookup result is empty or ``None``.
    """
    stored = memory.get({"configurable": {"thread_id": thread_id}})
    if stored:
        return stored
    return {}
def run_agent(user_message: str, context: dict, chat_history: list):
    """Handle one user turn and return what the UI needs to render it.

    Parameters
    ----------
    user_message : str
        The user's message or a [REVIEW_TABLE_SUBMITTED] payload.
    context : dict
        Read for 'file_path' (default "") and 'thread_id'
        (default "thread-001").
    chat_history : list
        Accepted for interface compatibility; not read by this function.
        The agent is invoked with only the new message and the thread id.

    Returns
    -------
    tuple
        ``(response_text, review_table_data, phase_bar_html)`` where
        ``review_table_data`` is a list of row dicts (possibly empty) and
        ``phase_bar_html`` is always an HTML string.
    """
    file_path = context.get("file_path", "")
    thread_id = context.get("thread_id", "thread-001")
    summaries_path = "outputs/summaries_abstract.json"

    # Shortcut: a "start phase 2" message builds the review table directly
    # from the summaries file instead of calling the LLM.
    # NOTE(review): the system prompt tells users to say 'Start Phase 2' to
    # trigger BERTopic, yet this branch never reaches the agent. Confirm that
    # something else (e.g. app.py) produces the summaries file first.
    if user_message.strip().lower().startswith("start phase 2"):
        phase2_pending = _phase_bar_html(
            "[PHASE_PROGRESS: P1=done, P2=pending, P3=pending, P4=pending, P5=pending, P5.5=pending, P6=pending]"
        )
        if not os.path.exists(summaries_path):
            return (
                "Summaries not found. Run BERTopic discovery first (Phase 2).",
                [],
                phase2_pending,
            )
        try:
            with open(summaries_path, encoding="utf-8") as f:
                summaries = json.load(f)
        except (OSError, json.JSONDecodeError) as exc:
            # Report an unreadable file the same way agent failures are
            # reported, instead of letting the exception escape to the UI.
            return (f"Could not read {summaries_path}: {exc}", [], phase2_pending)

        # Largest 20 clusters first.
        top = sorted(summaries, key=lambda s: s.get("size", 0), reverse=True)[:20]
        md_lines = [
            "| # | Cluster ID | Topic Label | Top Evidence | Sentences | Papers | Approve | Rename To | Reasoning |",
            "|---|------------|-------------|--------------|-----------|--------|---------|-----------|-----------|",
        ]
        for i, s in enumerate(top, start=1):
            top_ev = _md_cell("; ".join(s.get("top_sentences", [])[:2]))
            label = _md_cell(s.get("label", ""))
            md_lines.append(
                f"| {i} | {s.get('cluster_id')} | {label} | {top_ev} | "
                f"{s.get('size', 0)} | {len(s.get('papers', []))} | ✅ | | |"
            )
        md_table = "\n".join(md_lines)
        phase_html = _phase_bar_html(
            "[PHASE_PROGRESS: P1=done, P2=done, P3=pending, P4=pending, P5=pending, P5.5=pending, P6=pending]"
        )
        return md_table, _build_review_table(md_table), phase_html

    if not os.environ.get("MISTRAL_API_KEY"):
        return (
            "Mistral API key is missing. Set the `MISTRAL_API_KEY` environment variable, "
            "restart the app, and then try again.",
            [],
            _phase_bar_html(""),
        )

    # Prepend a file path hint when a file has been uploaded.
    full_message = (
        f"[FILE_PATH: {file_path}]\n{user_message}" if file_path else user_message
    )
    config = {"configurable": {"thread_id": thread_id}}
    try:
        response = agent.invoke({"messages": [("human", full_message)]}, config=config)
        ai_text = response["messages"][-1].content
    except Exception as exc:
        # Top-level boundary: surface any agent/LLM failure as a chat message.
        return (f"Agent execution failed: {exc}", [], _phase_bar_html(""))

    # _parse_phase_progress returns (html, clean_text) when the tag is
    # present and a bare HTML string otherwise.
    parsed = _parse_phase_progress(ai_text)
    if isinstance(parsed, tuple):
        phase_html, clean_text = parsed
    else:
        phase_html, clean_text = parsed, ai_text

    review_data = _build_review_table(clean_text)
    # Fallback: if the agent emitted no review table but summaries exist,
    # populate the table from the summaries file so the UI still shows one.
    if not review_data and os.path.exists(summaries_path):
        try:
            with open(summaries_path, encoding="utf-8") as f:
                summaries = json.load(f)
            review_data = [
                {
                    "#": s.get("cluster_id", ""),
                    "Topic Label": s.get("label", ""),
                    "Top Evidence": "; ".join(s.get("top_sentences", [])[:2]),
                    "Sentences": s.get("size", 0),
                    "Papers": len(s.get("papers", [])),
                    "Approve": False,
                    "Rename To": "",
                    "Reasoning": "",
                }
                for s in summaries
            ]
        except (OSError, ValueError, TypeError, AttributeError):
            # Best effort only: an unreadable or oddly shaped summaries file
            # simply means no fallback table.
            review_data = []
    return clean_text, review_data, phase_html


def _phase_bar_html(text: str) -> str:
    """Return only the HTML part of ``_parse_phase_progress(text)``."""
    rendered = _parse_phase_progress(text)
    return rendered[0] if isinstance(rendered, tuple) else rendered


def _md_cell(value) -> str:
    """Make ``value`` safe for one markdown table cell (no pipes or newlines)."""
    return str(value).replace("\r", " ").replace("\n", " ").replace("|", "/")
|