Spaces:
Sleeping
Sleeping
| """ | |
| agent.py — LangGraph ReAct Agent for BERTopic Agentic Thematic Analysis | |
| Uses ChatMistralAI + MemorySaver + all 7 tools from tools.py | |
| """ | |
| import json | |
| import os | |
| import re | |
| import pandas as pd | |
| from langgraph.prebuilt import create_react_agent | |
| from langgraph.checkpoint.memory import MemorySaver | |
| from langchain_mistralai import ChatMistralAI | |
| from tools import ( | |
| load_scopus_csv, | |
| run_bertopic_discovery, | |
| label_topics_with_llm, | |
| consolidate_into_themes, | |
| compare_with_taxonomy, | |
| generate_comparison_csv, | |
| export_narrative, | |
| ) | |
# Chat model shared by the whole module; it is handed to create_react_agent
# below as the agent's reasoning model.
llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0.2,
    # Falls back to an empty string when the variable is unset so that the
    # module can still be imported; run_agent() checks the variable again and
    # returns a user-facing message before invoking the agent without a key.
    api_key=os.environ.get("MISTRAL_API_KEY", ""),
)
# Checkpoint store passed to the agent as its checkpointer and read back by
# get_agent_state(); conversations are keyed by the "thread_id" config value.
memory = MemorySaver()
# System prompt given to the ReAct agent via create_react_agent(prompt=...).
# It describes the seven tools, the two run configurations (abstract/title),
# the phase-by-phase workflow with its STOP gates, and the
# [PHASE_PROGRESS: ...] tag that _parse_phase_progress() turns into HTML.
#
# NOTE(review): Phase 5 says both "Confirm with user" and "No STOP gate —
#   flows directly into Phase 5.5", while the behavioural rules say
#   "ONE PHASE PER MESSAGE" — confirm which behaviour is intended.
# NOTE(review): Phase 1 tells the user to say 'Start Phase 2', but run_agent()
#   intercepts messages starting with "start phase 2" and answers without
#   invoking the agent, so the Phase 2 actions below are not triggered by
#   that phrase — verify against app.py.
# NOTE(review): run_agent() prefixes user messages with "[FILE_PATH: ...]";
#   the prompt does not describe that tag. _parse_phase_progress() also
#   accepts an "active" status that the prompt never mentions.
SYSTEM_PROMPT = """
You are an expert computational thematic analysis agent. You follow Braun & Clarke (2006)
six-phase thematic analysis methodology, adapted for computational corpus analysis using
BERTopic with sentence-transformer embeddings and agglomerative clustering.
1. load_scopus_csv(file_path: str)
→ Load the CSV. Count papers, abstract sentences, title sentences.
→ Strip boilerplate text from abstracts.
→ Saves cleaned_data.json to outputs/.
→ Input: absolute file path string.
2. run_bertopic_discovery(run_config: str)
→ Embeds sentences using all-MiniLM-L6-v2.
→ Clusters with AgglomerativeClustering (cosine, threshold=0.7).
→ Extracts 5 nearest evidence sentences per cluster.
→ Saves summaries_{tag}.json, embeddings_{tag}.npy, and 2 chart HTML files.
→ Input JSON: {"columns": ["Abstract"]} or {"columns": ["Title"]}
→ Run TWICE: once for Abstract (tag=abstract), once for Title (tag=title).
3. label_topics_with_llm(labelling_input: str)
→ You (the LLM) read the top_sentences for each cluster from summaries_{tag}.json,
then SELF-SUPPLY the llm_labels list with your best label, category,
confidence (0–1), and reasoning for each cluster.
→ Input JSON: {
"tag": "abstract",
"llm_labels": [
{"cluster_id": 0, "label": "AI in Healthcare", "category": "Applied AI",
"confidence": 0.92, "reasoning": "Sentences discuss medical diagnostics..."},
...
]
}
4. consolidate_into_themes(consolidation_input: str)
→ Applies user approvals from the Review Table.
→ Merges approved clusters into final themes with final labels.
→ Saves themes_{tag}.json and chart_keywords.html.
→ Input JSON: {
"tag": "abstract",
"approvals": [
{"cluster_id": 0, "approved": true, "rename_to": "AI in Medicine",
"reasoning": "Covers core domain"},
...
]
}
5. compare_with_taxonomy(taxonomy_input: str)
→ Maps each final theme to the PAJAIS taxonomy.
→ Marks each theme as MAPPED or NOVEL.
→ You self-supply the mappings list.
→ Input JSON: {
"tag": "abstract",
"mappings": [
{"final_label": "AI in Medicine", "pajais_category": "Healthcare IS",
"mapped": true},
...
]
}
6. generate_comparison_csv(comparison_input: str)
→ Generates side-by-side CSV and Plotly chart comparing abstract vs title themes.
→ Input JSON: {"tags": ["abstract", "title"]}
7. export_narrative(narrative_input: str)
→ You write the ~500-word Section 7 narrative yourself.
→ Input JSON: {
"tag": "abstract",
"narrative": "...(your 500-word narrative here)...",
"researcher_name": "..."
}
════════════════════════════════════════════════════════════════
RUN CONFIGURATIONS
════════════════════════════════════════════════════════════════
• Abstract run: columns = ["Abstract"] → tag = "abstract"
• Title run: columns = ["Title"] → tag = "title"
Always run BERTopic for BOTH configurations before Phase 3.
════════════════════════════════════════════════════════════════
BRAUN & CLARKE 6-PHASE WORKFLOW
════════════════════════════════════════════════════════════════
PHASE 1 — FAMILIARISATION
Goal: Understand the dataset.
Action:
1. Call load_scopus_csv(file_path) with the uploaded file path.
2. Report: total papers, abstract sentences, title sentences, column list.
3. Show 5 sample titles.
STOP after Phase 1. Say:
"✅ Phase 1 complete. Familiarisation done. Say 'Start Phase 2' to begin coding."
──────────────────────────────────────────────────────────────
PHASE 2 — INITIAL CODING
Goal: Generate initial semantic codes (clusters) from the corpus.
Actions:
1. Call run_bertopic_discovery({"columns": ["Abstract"]})
2. Call run_bertopic_discovery({"columns": ["Title"]})
3. Read outputs/summaries_abstract.json — list ALL cluster IDs and their top 2 sentences.
4. Analyse each cluster's top_sentences yourself.
5. Call label_topics_with_llm with your self-generated labels for the ABSTRACT run.
6. Call label_topics_with_llm with your self-generated labels for the TITLE run.
7. Build and present a REVIEW TABLE for the user (for abstract clusters):
Columns: [#, Topic Label, Top Evidence, Sentences, Papers, Approve, Rename To, Reasoning]
Fill Approve=True for confident clusters, Approve=False for weak/duplicate ones.
*** STOP GATE AFTER PHASE 2 ***
Say: "⏸️ STOP — Phase 2 complete. Review the table above.
Edit Approve/Rename To/Reasoning columns, then click Submit Review to proceed to Phase 3."
──────────────────────────────────────────────────────────────
PHASE 3 — SEARCHING FOR THEMES
Goal: Group related codes into broader themes.
Trigger: User submits the review table (message begins with [REVIEW_TABLE_SUBMITTED]).
Actions:
1. Parse the JSON review table from the user's message.
2. Call consolidate_into_themes with the parsed approvals for "abstract".
3. Call consolidate_into_themes with approvals for "title" (approve all by default).
4. Report the final theme list with counts.
*** STOP GATE AFTER PHASE 3 ***
Say: "⏸️ STOP — Phase 3 complete. [N] themes consolidated.
Review the theme list above. Say 'Proceed to Phase 4' when satisfied."
──────────────────────────────────────────────────────────────
PHASE 4 — REVIEWING THEMES
Goal: Theoretical saturation check.
Actions:
1. Analyse theme sizes and sentence counts.
2. Flag any theme with fewer than 3 sentences as POTENTIALLY WEAK.
3. Flag any two themes sharing >60% of their top keywords as POTENTIALLY OVERLAPPING.
4. Report saturation status: SATURATED or REQUIRES REVISION.
5. Recommend merges or splits if needed.
*** STOP GATE AFTER PHASE 4 ***
Say: "⏸️ STOP — Phase 4 complete. Saturation analysis done.
Say 'Proceed to Phase 5' to finalise theme names."
──────────────────────────────────────────────────────────────
PHASE 5 — DEFINING AND NAMING THEMES
Goal: Finalize descriptive theme names and definitions.
Actions:
1. For each theme, write a 1-sentence definition.
2. Present final theme names and definitions in a clean table.
3. Confirm with user.
(No STOP gate — flows directly into Phase 5.5)
──────────────────────────────────────────────────────────────
PHASE 5.5 — PAJAIS TAXONOMY MAPPING
Goal: Position themes within the IS research landscape.
Actions:
1. Call compare_with_taxonomy for the abstract run — self-supply your mappings.
2. Call compare_with_taxonomy for the title run — self-supply your mappings.
3. Present a table: Theme | PAJAIS Category | Status (MAPPED/NOVEL).
*** STOP GATE AFTER PHASE 5.5 ***
Say: "⏸️ STOP — Phase 5.5 complete. PAJAIS mapping done.
Say 'Generate Final Report' to proceed to Phase 6."
──────────────────────────────────────────────────────────────
PHASE 6 — WRITING UP (REPORT)
Goal: Generate the final deliverables.
Actions:
1. Call generate_comparison_csv({"tags": ["abstract", "title"]})
2. Write a ~500-word academic narrative (Section 7) covering:
- Research context
- Summary of each theme with evidence
- Comparison of abstract vs title themes
- PAJAIS taxonomy positioning
- Implications for IS research
3. Call export_narrative with your narrative text.
4. Tell the user: outputs are in the outputs/ folder, click Refresh Downloads.
════════════════════════════════════════════════════════════════
STRICT BEHAVIOURAL RULES
════════════════════════════════════════════════════════════════
• ONE PHASE PER MESSAGE. Never jump ahead.
• At each STOP gate, wait for explicit user confirmation before proceeding.
• Never skip a phase.
• Always self-supply data for label_topics_with_llm, compare_with_taxonomy,
and export_narrative — do not ask the user for these.
• When the user submits a review table ([REVIEW_TABLE_SUBMITTED]), parse it
and call consolidate_into_themes immediately.
• Be concise. Avoid repeating instructions.
• If a tool returns an error, report it clearly and ask the user how to proceed.
• Keep all intermediate files in the outputs/ directory.
════════════════════════════════════════════════════════════════
PHASE PROGRESS HTML FORMAT
════════════════════════════════════════════════════════════════
After completing each phase, include in your response:
[PHASE_PROGRESS: P1=done, P2=done, P3=pending, P4=pending, P5=pending, P5.5=pending, P6=pending]
(Replace 'done'/'pending' accurately for the current state.)
"""
# ─── Agent ────────────────────────────────────────────────────────────────────
# Tools exposed to the agent: the seven callables imported from tools.py, in
# the same order in which SYSTEM_PROMPT numbers them.
tools_list = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]
# Module-level agent instance used by run_agent(). It combines the Mistral
# chat model, the tool list, the checkpointer (so state is kept per
# "thread_id"), and the system prompt defined above.
agent = create_react_agent(
    model=llm,
    tools=tools_list,
    checkpointer=memory,
    prompt=SYSTEM_PROMPT,
)
| # ─── Helpers for app.py ─────────────────────────────────────────────────────── | |
| def _parse_phase_progress(text: str) -> str: | |
| """Extract PHASE_PROGRESS tag from agent response and render as HTML.""" | |
| match = re.search(r"\[PHASE_PROGRESS:(.*?)\]", text, re.DOTALL) | |
| status_map = { | |
| "done": ("✅", "#22c55e"), | |
| "pending": ("⬜", "#94a3b8"), | |
| "active": ("🔄", "#3b82f6"), | |
| } | |
| labels = ["P1", "P2", "P3", "P4", "P5", "P5.5", "P6"] | |
| if not match: | |
| return "<div style='padding:10px;background:#f0f4ff;border-radius:8px'>" \ | |
| "<b>Phase Progress:</b> " + \ | |
| " ".join(f"<span style='margin-left:8px'>⬜ {l}</span>" for l in labels) + \ | |
| "</div>" | |
| progress_str = match.group(1) | |
| state = {} | |
| for part in progress_str.split(","): | |
| part = part.strip() | |
| kv = part.split("=") | |
| if len(kv) == 2: | |
| state[kv[0].strip()] = kv[1].strip() | |
| def _badge(label): | |
| s = state.get(label, "pending") | |
| icon, color = status_map.get(s, ("⬜", "#94a3b8")) | |
| return (f"<span style='margin-left:8px;color:{color};font-weight:600'>" | |
| f"{icon} {label}</span>") | |
| badges = "".join(map(_badge, labels)) | |
| clean = re.sub(r"\[PHASE_PROGRESS:.*?\]", "", text, flags=re.DOTALL).strip() | |
| return ( | |
| "<div style='padding:10px;background:#f0f4ff;border-radius:8px;" | |
| "font-family:sans-serif'>" | |
| f"<b>Phase Progress:</b>{badges}</div>", | |
| clean | |
| ) | |
| def _build_review_table(agent_text: str) -> list: | |
| """ | |
| Parse a markdown table from the agent response into a list of dicts | |
| for the Gradio Dataframe review table. | |
| """ | |
| lines = agent_text.splitlines() | |
| # Find markdown table header line (starts with '|' and contains # and Topic) | |
| header_idx = None | |
| for i, ln in enumerate(lines): | |
| if ln.strip().startswith("|") and ("#" in ln) and ("Topic" in ln or "Topic Label" in ln): | |
| header_idx = i | |
| break | |
| if header_idx is None: | |
| # Fallback: TSV / whitespace-delimited | |
| lines = agent_text.strip().splitlines() | |
| header_idx = None | |
| for i, ln in enumerate(lines): | |
| if ("#" in ln) and ("Topic" in ln or "Topic Label" in ln): | |
| header_idx = i | |
| break | |
| if header_idx is None: | |
| return [] | |
| header_cells = re.split(r"\t| {2,}", lines[header_idx].strip()) | |
| data_lines = lines[header_idx+1:] | |
| else: | |
| # header exists as markdown table; collect following '|' rows | |
| header_cells = [c.strip() for c in lines[header_idx].strip().strip("|").split("|")] | |
| data_lines = [] | |
| # skip possible separator row like |---| | |
| j = header_idx + 1 | |
| if j < len(lines) and re.match(r"^\|[-\s:|]+\|$", lines[j].strip()): | |
| j += 1 | |
| while j < len(lines) and lines[j].strip().startswith("|"): | |
| data_lines.append(lines[j]) | |
| j += 1 | |
| # Map header indices | |
| header_map = {} | |
| for idx, h in enumerate(header_cells): | |
| key = h.lower() | |
| if "#" in key: | |
| header_map["#"] = idx | |
| elif "cluster" in key and "id" in key: | |
| header_map["Cluster ID"] = idx | |
| elif "topic" in key and "label" in key: | |
| header_map["Topic Label"] = idx | |
| elif "evidence" in key: | |
| header_map["Top Evidence"] = idx | |
| elif "sentence" in key: | |
| header_map["Sentences"] = idx | |
| elif "paper" in key: | |
| header_map["Papers"] = idx | |
| elif "approve" in key: | |
| header_map["Approve"] = idx | |
| elif "rename" in key: | |
| header_map["Rename To"] = idx | |
| elif "reason" in key: | |
| header_map["Reasoning"] = idx | |
| rows = [] | |
| for ln in data_lines: | |
| cells = [c.strip() for c in ln.strip().strip("|").split("|")] if ln.strip().startswith("|") else re.split(r"\t| {2,}", ln.strip()) | |
| if len(cells) < 2: | |
| continue | |
| row = {"#": "", "Topic Label": "", "Top Evidence": "", "Sentences": "", "Papers": "", "Approve": False, "Rename To": "", "Reasoning": ""} | |
| def safe_get(idx): | |
| try: | |
| return cells[idx] | |
| except Exception: | |
| return "" | |
| if "#" in header_map: | |
| row["#"] = safe_get(header_map["#"]) or safe_get(0) | |
| if "Cluster ID" in header_map: | |
| row["Cluster ID"] = safe_get(header_map["Cluster ID"]) or "" | |
| if "Topic Label" in header_map: | |
| row["Topic Label"] = safe_get(header_map["Topic Label"]) or safe_get(1) | |
| if "Top Evidence" in header_map: | |
| row["Top Evidence"] = safe_get(header_map["Top Evidence"]) or "" | |
| if "Sentences" in header_map: | |
| row["Sentences"] = safe_get(header_map["Sentences"]) or "" | |
| if "Papers" in header_map: | |
| row["Papers"] = safe_get(header_map["Papers"]) or "" | |
| if "Approve" in header_map: | |
| val = safe_get(header_map["Approve"]).lower() | |
| row["Approve"] = val in ("true","yes","✅","1","y","approve") | |
| if "Rename To" in header_map: | |
| row["Rename To"] = safe_get(header_map["Rename To"]) or "" | |
| if "Reasoning" in header_map: | |
| row["Reasoning"] = safe_get(header_map["Reasoning"]) or "" | |
| rows.append(row) | |
| return rows | |
| raw_rows = table_pattern.group(2).strip().splitlines() | |
| rows = [] | |
| def _parse_row(line): | |
| cells = list(map(str.strip, line.strip("|").split("|"))) | |
| if len(cells) >= 8: | |
| return { | |
| "#": cells[0], | |
| "Topic Label": cells[1], | |
| "Top Evidence": cells[2], | |
| "Sentences": cells[3], | |
| "Papers": cells[4], | |
| "Approve": cells[5].lower() in ("true", "yes", "✅", "1"), | |
| "Rename To": cells[6], | |
| "Reasoning": cells[7], | |
| } | |
| return None | |
| parsed = list(map(_parse_row, raw_rows)) | |
| cleaned = list(filter(lambda r: r is not None, parsed)) | |
| return cleaned | |
def get_agent_state(thread_id: str) -> dict:
    """Look up what the module-level checkpointer holds for ``thread_id``.

    Returns whatever ``memory.get`` yields for the thread's config, or an
    empty dict when that value is falsy (for example, nothing stored yet).
    """
    snapshot = memory.get({"configurable": {"thread_id": thread_id}})
    return snapshot if snapshot else {}
def _phase_bar_html(text: str) -> str:
    """Return only the HTML part of ``_parse_phase_progress(text)``.

    ``_parse_phase_progress`` returns a bare string when ``text`` has no tag
    and an ``(html, clean_text)`` tuple when it has one; this helper hides
    that difference for callers that only need the progress bar.
    """
    rendered = _parse_phase_progress(text)
    return rendered[0] if isinstance(rendered, tuple) else rendered


def _md_cell(value) -> str:
    """Make ``value`` safe to embed in a single markdown table cell."""
    # Newlines would end the table row and a raw pipe would start a new cell.
    return " ".join(str(value).split()).replace("|", "\\|")


def _review_rows_from_summaries(path: str) -> list:
    """Best-effort: build review rows from a summaries JSON file.

    Returns ``[]`` when the file is missing, unreadable, or not shaped like
    a list of cluster dicts.
    """
    if not os.path.exists(path):
        return []
    try:
        with open(path, encoding="utf-8") as f:
            summaries = json.load(f)
        return [
            {
                "#": s.get("cluster_id", ""),
                "Topic Label": s.get("label", ""),
                "Top Evidence": "; ".join(s.get("top_sentences", [])[:2]),
                "Sentences": s.get("size", 0),
                "Papers": len(s.get("papers", [])),
                "Approve": False,
                "Rename To": "",
                "Reasoning": "",
            }
            for s in summaries
        ]
    except (OSError, ValueError, TypeError, AttributeError, KeyError):
        return []


def run_agent(user_message: str, context: dict, chat_history: list):
    """Handle one user message and return what the UI needs to render.

    Parameters
    ----------
    user_message : str
        The user's message or a [REVIEW_TABLE_SUBMITTED] payload.
    context : dict
        May include 'file_path' (prepended to the message as a
        ``[FILE_PATH: ...]`` hint when non-empty) and 'thread_id'
        (defaults to "thread-001").
    chat_history : list
        Accepted for interface compatibility; not read by this function.
        Conversation state is kept by the agent's checkpointer per thread.

    Returns
    -------
    tuple
        ``(response_text, review_table_data, phase_bar_html)`` where
        ``review_table_data`` is a list of row dicts (possibly empty) and
        ``phase_bar_html`` is always an HTML string.
    """
    file_path = context.get("file_path", "")
    thread_id = context.get("thread_id", "thread-001")
    summaries_path = "outputs/summaries_abstract.json"

    # Shortcut: "start phase 2" builds the review table straight from the
    # summaries file and never reaches the LLM.
    # NOTE(review): when the summaries file does not exist yet this returns a
    # dead-end message instead of letting the agent run BERTopic — confirm
    # that app.py offers another way to produce the summaries.
    if user_message.strip().lower().startswith("start phase 2"):
        if not os.path.exists(summaries_path):
            return (
                "Summaries not found. Run BERTopic discovery first (Phase 2).",
                [],
                # Unwrapped: the tag is present, so the parser returns a tuple.
                _phase_bar_html(
                    "[PHASE_PROGRESS: P1=done, P2=pending, P3=pending, "
                    "P4=pending, P5=pending, P5.5=pending, P6=pending]"
                ),
            )
        try:
            with open(summaries_path, encoding="utf-8") as f:
                summaries = json.load(f)
        except (OSError, ValueError) as exc:
            return (
                f"Could not read {summaries_path}: {exc}",
                [],
                _phase_bar_html(
                    "[PHASE_PROGRESS: P1=done, P2=pending, P3=pending, "
                    "P4=pending, P5=pending, P5.5=pending, P6=pending]"
                ),
            )

        # Largest 20 clusters first.
        top = sorted(summaries, key=lambda s: s.get("size", 0), reverse=True)[:20]
        md_lines = [
            "| # | Cluster ID | Topic Label | Top Evidence | Sentences | Papers | Approve | Rename To | Reasoning |",
            "|---|------------|-------------|--------------|-----------|--------|---------|-----------|-----------|",
        ]
        review_data = []
        for i, s in enumerate(top, start=1):
            cluster_id = s.get("cluster_id")
            label = s.get("label", "")
            top_ev = "; ".join(s.get("top_sentences", [])[:2])
            size = s.get("size", 0)
            n_papers = len(s.get("papers", []))
            md_lines.append(
                f"| {i} | {_md_cell(cluster_id)} | {_md_cell(label)} | "
                f"{_md_cell(top_ev)} | {size} | {n_papers} | ✅ | | |"
            )
            # Rows are built directly rather than re-parsed from the markdown,
            # so a "|" or newline inside a sentence cannot shift the columns.
            review_data.append({
                "#": str(i),
                "Topic Label": str(label).strip(),
                "Top Evidence": top_ev.strip(),
                "Sentences": str(size),
                "Papers": str(n_papers),
                "Approve": True,
                "Rename To": "",
                "Reasoning": "",
                "Cluster ID": str(cluster_id),
            })
        phase_html = _phase_bar_html(
            "[PHASE_PROGRESS: P1=done, P2=done, P3=pending, P4=pending, "
            "P5=pending, P5.5=pending, P6=pending]"
        )
        return "\n".join(md_lines), review_data, phase_html

    if not os.environ.get("MISTRAL_API_KEY"):
        return (
            "Mistral API key is missing. Set the `MISTRAL_API_KEY` environment variable, "
            "restart the app, and then try again.",
            [],
            _phase_bar_html(""),
        )

    # Prepend the file path hint if one is present.
    full_message = (
        f"[FILE_PATH: {file_path}]\n{user_message}" if file_path else user_message
    )
    config = {"configurable": {"thread_id": thread_id}}
    try:
        response = agent.invoke({"messages": [("human", full_message)]}, config=config)
        ai_text = response["messages"][-1].content
    except Exception as exc:
        # UI boundary: report any agent/tool failure as text instead of raising.
        return (
            f"Agent execution failed: {exc}",
            [],
            _phase_bar_html(""),
        )

    # Split the progress tag from the visible text.
    parsed = _parse_phase_progress(ai_text)
    if isinstance(parsed, tuple):
        phase_html, clean_text = parsed
    else:
        phase_html, clean_text = parsed, ai_text

    # Prefer a table emitted by the agent; otherwise fall back to the
    # summaries file so the UI still has a usable Phase 2 review table.
    review_data = _build_review_table(clean_text)
    if not review_data:
        review_data = _review_rows_from_summaries(summaries_path)
    return clean_text, review_data, phase_html