File size: 22,368 Bytes
ccab3d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
"""
agent.py — LangGraph ReAct Agent for BERTopic Agentic Thematic Analysis
Uses ChatMistralAI + MemorySaver + all 7 tools from tools.py
"""

import json
import os
import re
import pandas as pd
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_mistralai import ChatMistralAI
from tools import (
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
)

# Chat model driving the agent. The API key is read from the MISTRAL_API_KEY
# environment variable when this module is imported; an empty string is passed
# if the variable is unset.
llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0.2,
    api_key=os.environ.get("MISTRAL_API_KEY", ""),
)

# Checkpointer passed to create_react_agent below and queried by
# get_agent_state(); run_agent() addresses it through a per-call thread_id.
memory = MemorySaver()

# System prompt handed to create_react_agent. It lists the seven tools and their
# JSON inputs, the phase-by-phase workflow with its STOP gates, the behavioural
# rules, and the [PHASE_PROGRESS: ...] tag format that _parse_phase_progress
# extracts from the agent's replies.
SYSTEM_PROMPT = """
You are an expert computational thematic analysis agent. You follow Braun & Clarke (2006)
six-phase thematic analysis methodology, adapted for computational corpus analysis using
BERTopic with sentence-transformer embeddings and agglomerative clustering.

1. load_scopus_csv(file_path: str)
   → Load the CSV. Count papers, abstract sentences, title sentences.
   → Strip boilerplate text from abstracts.
   → Saves cleaned_data.json to outputs/.
   → Input: absolute file path string.

2. run_bertopic_discovery(run_config: str)
   → Embeds sentences using all-MiniLM-L6-v2.
   → Clusters with AgglomerativeClustering (cosine, threshold=0.7).
   → Extracts 5 nearest evidence sentences per cluster.
   → Saves summaries_{tag}.json, embeddings_{tag}.npy, and 2 chart HTML files.
   → Input JSON: {"columns": ["Abstract"]} or {"columns": ["Title"]}
   → Run TWICE: once for Abstract (tag=abstract), once for Title (tag=title).

3. label_topics_with_llm(labelling_input: str)
   → You (the LLM) read the top_sentences for each cluster from summaries_{tag}.json,
     then SELF-SUPPLY the llm_labels list with your best label, category,
     confidence (0–1), and reasoning for each cluster.
   → Input JSON: {
       "tag": "abstract",
       "llm_labels": [
         {"cluster_id": 0, "label": "AI in Healthcare", "category": "Applied AI",
          "confidence": 0.92, "reasoning": "Sentences discuss medical diagnostics..."},
         ...
       ]
     }

4. consolidate_into_themes(consolidation_input: str)
   → Applies user approvals from the Review Table.
   → Merges approved clusters into final themes with final labels.
   → Saves themes_{tag}.json and chart_keywords.html.
   → Input JSON: {
       "tag": "abstract",
       "approvals": [
         {"cluster_id": 0, "approved": true, "rename_to": "AI in Medicine",
          "reasoning": "Covers core domain"},
         ...
       ]
     }

5. compare_with_taxonomy(taxonomy_input: str)
   → Maps each final theme to the PAJAIS taxonomy.
   → Marks each theme as MAPPED or NOVEL.
   → You self-supply the mappings list.
   → Input JSON: {
       "tag": "abstract",
       "mappings": [
         {"final_label": "AI in Medicine", "pajais_category": "Healthcare IS",
          "mapped": true},
         ...
       ]
     }

6. generate_comparison_csv(comparison_input: str)
   → Generates side-by-side CSV and Plotly chart comparing abstract vs title themes.
   → Input JSON: {"tags": ["abstract", "title"]}

7. export_narrative(narrative_input: str)
   → You write the ~500-word Section 7 narrative yourself.
   → Input JSON: {
       "tag": "abstract",
       "narrative": "...(your 500-word narrative here)...",
       "researcher_name": "..."
     }

════════════════════════════════════════════════════════════════
RUN CONFIGURATIONS
════════════════════════════════════════════════════════════════
• Abstract run: columns = ["Abstract"]   → tag = "abstract"
• Title run:    columns = ["Title"]      → tag = "title"
Always run BERTopic for BOTH configurations before Phase 3.

════════════════════════════════════════════════════════════════
BRAUN & CLARKE 6-PHASE WORKFLOW
════════════════════════════════════════════════════════════════

PHASE 1 — FAMILIARISATION
  Goal: Understand the dataset.
  Action:
    1. Call load_scopus_csv(file_path) with the uploaded file path.
    2. Report: total papers, abstract sentences, title sentences, column list.
    3. Show 5 sample titles.
  STOP after Phase 1. Say:
  "✅ Phase 1 complete. Familiarisation done. Say 'Start Phase 2' to begin coding."

──────────────────────────────────────────────────────────────

PHASE 2 — INITIAL CODING
  Goal: Generate initial semantic codes (clusters) from the corpus.
  Actions:
    1. Call run_bertopic_discovery({"columns": ["Abstract"]})
    2. Call run_bertopic_discovery({"columns": ["Title"]})
    3. Read outputs/summaries_abstract.json — list ALL cluster IDs and their top 2 sentences.
    4. Analyse each cluster's top_sentences yourself.
    5. Call label_topics_with_llm with your self-generated labels for the ABSTRACT run.
    6. Call label_topics_with_llm with your self-generated labels for the TITLE run.
    7. Build and present a REVIEW TABLE for the user (for abstract clusters):
       Columns: [#, Topic Label, Top Evidence, Sentences, Papers, Approve, Rename To, Reasoning]
       Fill Approve=True for confident clusters, Approve=False for weak/duplicate ones.
  *** STOP GATE AFTER PHASE 2 ***
  Say: "⏸️ STOP — Phase 2 complete. Review the table above.
  Edit Approve/Rename To/Reasoning columns, then click Submit Review to proceed to Phase 3."

──────────────────────────────────────────────────────────────

PHASE 3 — SEARCHING FOR THEMES
  Goal: Group related codes into broader themes.
  Trigger: User submits the review table (message begins with [REVIEW_TABLE_SUBMITTED]).
  Actions:
    1. Parse the JSON review table from the user's message.
    2. Call consolidate_into_themes with the parsed approvals for "abstract".
    3. Call consolidate_into_themes with approvals for "title" (approve all by default).
    4. Report the final theme list with counts.
  *** STOP GATE AFTER PHASE 3 ***
  Say: "⏸️ STOP — Phase 3 complete. [N] themes consolidated.
  Review the theme list above. Say 'Proceed to Phase 4' when satisfied."

──────────────────────────────────────────────────────────────

PHASE 4 — REVIEWING THEMES
  Goal: Theoretical saturation check.
  Actions:
    1. Analyse theme sizes and sentence counts.
    2. Flag any theme with fewer than 3 sentences as POTENTIALLY WEAK.
    3. Flag any two themes sharing >60% of their top keywords as POTENTIALLY OVERLAPPING.
    4. Report saturation status: SATURATED or REQUIRES REVISION.
    5. Recommend merges or splits if needed.
  *** STOP GATE AFTER PHASE 4 ***
  Say: "⏸️ STOP — Phase 4 complete. Saturation analysis done.
  Say 'Proceed to Phase 5' to finalise theme names."

──────────────────────────────────────────────────────────────

PHASE 5 — DEFINING AND NAMING THEMES
  Goal: Finalize descriptive theme names and definitions.
  Actions:
    1. For each theme, write a 1-sentence definition.
    2. Present final theme names and definitions in a clean table.
    3. Confirm with user.
  (No STOP gate — flows directly into Phase 5.5)

──────────────────────────────────────────────────────────────

PHASE 5.5 — PAJAIS TAXONOMY MAPPING
  Goal: Position themes within the IS research landscape.
  Actions:
    1. Call compare_with_taxonomy for the abstract run — self-supply your mappings.
    2. Call compare_with_taxonomy for the title run — self-supply your mappings.
    3. Present a table: Theme | PAJAIS Category | Status (MAPPED/NOVEL).
  *** STOP GATE AFTER PHASE 5.5 ***
  Say: "⏸️ STOP — Phase 5.5 complete. PAJAIS mapping done.
  Say 'Generate Final Report' to proceed to Phase 6."

──────────────────────────────────────────────────────────────

PHASE 6 — WRITING UP (REPORT)
  Goal: Generate the final deliverables.
  Actions:
    1. Call generate_comparison_csv({"tags": ["abstract", "title"]})
    2. Write a ~500-word academic narrative (Section 7) covering:
       - Research context
       - Summary of each theme with evidence
       - Comparison of abstract vs title themes
       - PAJAIS taxonomy positioning
       - Implications for IS research
    3. Call export_narrative with your narrative text.
    4. Tell the user: outputs are in the outputs/ folder, click Refresh Downloads.

════════════════════════════════════════════════════════════════
STRICT BEHAVIOURAL RULES
════════════════════════════════════════════════════════════════

• ONE PHASE PER MESSAGE. Never jump ahead.
• At each STOP gate, wait for explicit user confirmation before proceeding.
• Never skip a phase.
• Always self-supply data for label_topics_with_llm, compare_with_taxonomy,
  and export_narrative — do not ask the user for these.
• When the user submits a review table ([REVIEW_TABLE_SUBMITTED]), parse it
  and call consolidate_into_themes immediately.
• Be concise. Avoid repeating instructions.
• If a tool returns an error, report it clearly and ask the user how to proceed.
• Keep all intermediate files in the outputs/ directory.

════════════════════════════════════════════════════════════════
PHASE PROGRESS HTML FORMAT
════════════════════════════════════════════════════════════════
After completing each phase, include in your response:
[PHASE_PROGRESS: P1=done, P2=done, P3=pending, P4=pending, P5=pending, P5.5=pending, P6=pending]
(Replace 'done'/'pending' accurately for the current state.)
"""

# ─── Agent ────────────────────────────────────────────────────────────────────
# Tools exposed to the agent; all seven are imported from tools.py above.
tools_list = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]

# Module-level agent instance used by run_agent(). It combines the Mistral chat
# model, the tool list, the shared checkpointer and the system prompt.
agent = create_react_agent(
    model=llm,
    tools=tools_list,
    checkpointer=memory,
    prompt=SYSTEM_PROMPT,
)

# ─── Helpers for app.py ───────────────────────────────────────────────────────

def _parse_phase_progress(text: str) -> str:
    """Extract PHASE_PROGRESS tag from agent response and render as HTML."""
    match = re.search(r"\[PHASE_PROGRESS:(.*?)\]", text, re.DOTALL)
    status_map = {
        "done":    ("βœ…", "#22c55e"),
        "pending": ("⬜", "#94a3b8"),
        "active":  ("πŸ”„", "#3b82f6"),
    }
    labels = ["P1", "P2", "P3", "P4", "P5", "P5.5", "P6"]

    if not match:
        return "<div style='padding:10px;background:#f0f4ff;border-radius:8px'>" \
               "<b>Phase Progress:</b> " + \
               " ".join(f"<span style='margin-left:8px'>⬜ {l}</span>" for l in labels) + \
               "</div>"

    progress_str = match.group(1)
    state = {}
    for part in progress_str.split(","):
        part = part.strip()
        kv   = part.split("=")
        if len(kv) == 2:
            state[kv[0].strip()] = kv[1].strip()

    def _badge(label):
        s = state.get(label, "pending")
        icon, color = status_map.get(s, ("⬜", "#94a3b8"))
        return (f"<span style='margin-left:8px;color:{color};font-weight:600'>"
                f"{icon} {label}</span>")

    badges = "".join(map(_badge, labels))
    clean  = re.sub(r"\[PHASE_PROGRESS:.*?\]", "", text, flags=re.DOTALL).strip()
    return (
        "<div style='padding:10px;background:#f0f4ff;border-radius:8px;"
        "font-family:sans-serif'>"
        f"<b>Phase Progress:</b>{badges}</div>",
        clean
    )


def _build_review_table(agent_text: str) -> list:
    """
    Parse a markdown table from the agent response into a list of dicts
    for the Gradio Dataframe review table.
    """
    lines = agent_text.splitlines()
    # Find markdown table header line (starts with '|' and contains # and Topic)
    header_idx = None
    for i, ln in enumerate(lines):
      if ln.strip().startswith("|") and ("#" in ln) and ("Topic" in ln or "Topic Label" in ln):
        header_idx = i
        break
    if header_idx is None:
      # Fallback: TSV / whitespace-delimited
      lines = agent_text.strip().splitlines()
      header_idx = None
      for i, ln in enumerate(lines):
        if ("#" in ln) and ("Topic" in ln or "Topic Label" in ln):
          header_idx = i
          break
      if header_idx is None:
        return []
      header_cells = re.split(r"\t| {2,}", lines[header_idx].strip())
      data_lines = lines[header_idx+1:]
    else:
      # header exists as markdown table; collect following '|' rows
      header_cells = [c.strip() for c in lines[header_idx].strip().strip("|").split("|")]
      data_lines = []
      # skip possible separator row like |---|
      j = header_idx + 1
      if j < len(lines) and re.match(r"^\|[-\s:|]+\|$", lines[j].strip()):
        j += 1
      while j < len(lines) and lines[j].strip().startswith("|"):
        data_lines.append(lines[j])
        j += 1

    # Map header indices
    header_map = {}
    for idx, h in enumerate(header_cells):
      key = h.lower()
      if "#" in key:
        header_map["#"] = idx
      elif "cluster" in key and "id" in key:
        header_map["Cluster ID"] = idx
      elif "topic" in key and "label" in key:
        header_map["Topic Label"] = idx
      elif "evidence" in key:
        header_map["Top Evidence"] = idx
      elif "sentence" in key:
        header_map["Sentences"] = idx
      elif "paper" in key:
        header_map["Papers"] = idx
      elif "approve" in key:
        header_map["Approve"] = idx
      elif "rename" in key:
        header_map["Rename To"] = idx
      elif "reason" in key:
        header_map["Reasoning"] = idx

    rows = []
    for ln in data_lines:
      cells = [c.strip() for c in ln.strip().strip("|").split("|")] if ln.strip().startswith("|") else re.split(r"\t| {2,}", ln.strip())
      if len(cells) < 2:
        continue
      row = {"#": "", "Topic Label": "", "Top Evidence": "", "Sentences": "", "Papers": "", "Approve": False, "Rename To": "", "Reasoning": ""}
      def safe_get(idx):
        try:
          return cells[idx]
        except Exception:
          return ""
      if "#" in header_map:
        row["#"] = safe_get(header_map["#"]) or safe_get(0)
      if "Cluster ID" in header_map:
        row["Cluster ID"] = safe_get(header_map["Cluster ID"]) or ""
      if "Topic Label" in header_map:
        row["Topic Label"] = safe_get(header_map["Topic Label"]) or safe_get(1)
      if "Top Evidence" in header_map:
        row["Top Evidence"] = safe_get(header_map["Top Evidence"]) or ""
      if "Sentences" in header_map:
        row["Sentences"] = safe_get(header_map["Sentences"]) or ""
      if "Papers" in header_map:
        row["Papers"] = safe_get(header_map["Papers"]) or ""
      if "Approve" in header_map:
        val = safe_get(header_map["Approve"]).lower()
        row["Approve"] = val in ("true","yes","βœ…","1","y","approve")
      if "Rename To" in header_map:
        row["Rename To"] = safe_get(header_map["Rename To"]) or ""
      if "Reasoning" in header_map:
        row["Reasoning"] = safe_get(header_map["Reasoning"]) or ""
      rows.append(row)
    return rows

    raw_rows = table_pattern.group(2).strip().splitlines()
    rows     = []

    def _parse_row(line):
        cells = list(map(str.strip, line.strip("|").split("|")))
        if len(cells) >= 8:
            return {
                "#":            cells[0],
                "Topic Label":  cells[1],
                "Top Evidence": cells[2],
                "Sentences":    cells[3],
                "Papers":       cells[4],
                "Approve":      cells[5].lower() in ("true", "yes", "βœ…", "1"),
                "Rename To":    cells[6],
                "Reasoning":    cells[7],
            }
        return None

    parsed  = list(map(_parse_row, raw_rows))
    cleaned = list(filter(lambda r: r is not None, parsed))
    return cleaned


def get_agent_state(thread_id: str) -> dict:
    """Look up what the module-level ``memory`` saver holds for ``thread_id``.

    Returns the value of ``memory.get`` for that thread, or an empty dict when
    that value is missing or otherwise falsy.
    """
    stored = memory.get({"configurable": {"thread_id": thread_id}})
    if stored:
        return stored
    return {}


def run_agent(user_message: str, context: dict, chat_history: list):
    """
    Invoke the agent with a user message and return:
      (response_text, review_table_data, phase_bar_html)

    ``phase_bar_html`` is always a plain HTML string and ``review_table_data``
    is a list of row dicts (possibly empty).

    Parameters
    ----------
    user_message : str
        The user's message or [REVIEW_TABLE_SUBMITTED] payload.
    context : dict
        May include 'file_path' (prepended to the message as a hint) and
        'thread_id' (defaults to "thread-001").
    chat_history : list
        List of (human, ai) tuples. Accepted for interface compatibility but
        not read here; the agent is invoked with only the current message and
        the checkpointer configured for ``thread_id``.
    """
    summaries_path = "outputs/summaries_abstract.json"

    def _phase_bar(tag_text):
        """HTML-only view of _parse_phase_progress (it may return a tuple)."""
        rendered = _parse_phase_progress(tag_text)
        return rendered[0] if isinstance(rendered, tuple) else rendered

    file_path = context.get("file_path", "")
    thread_id = context.get("thread_id", "thread-001")

    # Quick shortcut: if user requests to start Phase 2, build a review table
    # directly from outputs/summaries_abstract.json to avoid LLM calls.
    # NOTE(review): this intercepts the message before the agent sees it, so
    # when no summaries exist yet the agent is never asked to run the discovery
    # step -- confirm the summaries are produced elsewhere.
    if user_message.strip().lower().startswith("start phase 2"):
        phase2_pending = (
            "[PHASE_PROGRESS: P1=done, P2=pending, P3=pending, P4=pending, "
            "P5=pending, P5.5=pending, P6=pending]"
        )
        if not os.path.exists(summaries_path):
            return (
                "Summaries not found. Run BERTopic discovery first (Phase 2).",
                [],
                _phase_bar(phase2_pending),
            )

        try:
            with open(summaries_path, encoding="utf-8") as f:
                summaries = json.load(f)
        except (OSError, ValueError) as exc:
            # Unreadable or corrupt summaries file: report it instead of raising.
            return (
                f"Could not read {summaries_path}: {exc}",
                [],
                _phase_bar(phase2_pending),
            )

        # sort by size desc and take top 20
        top = sorted(summaries, key=lambda s: s.get("size", 0), reverse=True)[:20]

        # build markdown table
        md_lines = [
            "| # | Cluster ID | Topic Label | Top Evidence | Sentences | Papers | Approve | Rename To | Reasoning |",
            "|---|------------|-------------|--------------|-----------|--------|---------|-----------|-----------|",
        ]
        for i, s in enumerate(top, start=1):
            top_ev = "; ".join(s.get("top_sentences", [])[:2])
            md_lines.append(
                f"| {i} | {s.get('cluster_id')} | {s.get('label','')} | {top_ev} | "
                f"{s.get('size',0)} | {len(s.get('papers',[]))} | ✅ |  |  |"
            )
        md_table = "\n".join(md_lines)

        review_data = _build_review_table(md_table)
        # Every shortcut row is pre-approved; set the flag explicitly rather
        # than relying on the parser recognising the check-mark glyph.
        for row in review_data:
            row["Approve"] = True

        phase_html = _phase_bar(
            "[PHASE_PROGRESS: P1=done, P2=done, P3=pending, P4=pending, "
            "P5=pending, P5.5=pending, P6=pending]"
        )
        return md_table, review_data, phase_html

    if not os.environ.get("MISTRAL_API_KEY"):
        return (
            "Mistral API key is missing. Set the `MISTRAL_API_KEY` environment variable, "
            "restart the app, and then try again.",
            [],
            _phase_bar(""),
        )

    # Prepend file path hint if present
    full_message = (
        f"[FILE_PATH: {file_path}]\n{user_message}"
        if file_path
        else user_message
    )

    config = {"configurable": {"thread_id": thread_id}}
    try:
        response = agent.invoke({"messages": [("human", full_message)]}, config=config)
        ai_text = response["messages"][-1].content
    except Exception as exc:
        # Top-level boundary: surface any agent/tool failure to the UI as text.
        return (
            f"Agent execution failed: {exc}",
            [],
            _phase_bar(""),
        )

    # Parse phase progress bar; a tuple also carries the text with the tag removed.
    parsed = _parse_phase_progress(ai_text)
    if isinstance(parsed, tuple):
        phase_html, clean_text = parsed
    else:
        phase_html = parsed
        clean_text = ai_text

    # Parse review table if present
    review_data = _build_review_table(clean_text)

    # Fallback: if agent didn't emit a markdown review table but summaries exist,
    # populate the review table from outputs/summaries_abstract.json so the UI
    # shows a usable table for Phase 2 review.
    if not review_data and os.path.exists(summaries_path):
        try:
            with open(summaries_path, encoding="utf-8") as f:
                summaries = json.load(f)
            review_data = [
                {
                    "#": s.get("cluster_id", ""),
                    "Topic Label": s.get("label", ""),
                    "Top Evidence": "; ".join(s.get("top_sentences", [])[:2]),
                    "Sentences": s.get("size", 0),
                    "Papers": len(s.get("papers", [])),
                    "Approve": False,
                    "Rename To": "",
                    "Reasoning": "",
                }
                for s in summaries
            ]
        except (OSError, ValueError, TypeError, AttributeError):
            # Best effort only: an unreadable or unexpectedly shaped summaries
            # file leaves the review table empty.
            review_data = []

    return clean_text, review_data, phase_html