Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- agent.py +521 -0
- app.py +259 -0
- requirements.txt +13 -0
- tools.py +544 -0
agent.py
ADDED
|
@@ -0,0 +1,521 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
agent.py — LangGraph ReAct Agent for BERTopic Agentic Thematic Analysis
|
| 3 |
+
Uses ChatMistralAI + MemorySaver + all 7 tools from tools.py
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from langgraph.prebuilt import create_react_agent
|
| 11 |
+
from langgraph.checkpoint.memory import MemorySaver
|
| 12 |
+
from langchain_mistralai import ChatMistralAI
|
| 13 |
+
from tools import (
|
| 14 |
+
load_scopus_csv,
|
| 15 |
+
run_bertopic_discovery,
|
| 16 |
+
label_topics_with_llm,
|
| 17 |
+
consolidate_into_themes,
|
| 18 |
+
compare_with_taxonomy,
|
| 19 |
+
generate_comparison_csv,
|
| 20 |
+
export_narrative,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
# Chat model driving the agent. The key is read from the MISTRAL_API_KEY
# environment variable; an empty string is passed when it is not set.
llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0.2,
    api_key=os.getenv("MISTRAL_API_KEY", ""),
)

# In-memory checkpointer; passed to the agent as its `checkpointer`.
memory = MemorySaver()
|
| 30 |
+
|
| 31 |
+
SYSTEM_PROMPT = """
|
| 32 |
+
You are an expert computational thematic analysis agent. You follow Braun & Clarke (2006)
|
| 33 |
+
six-phase thematic analysis methodology, adapted for computational corpus analysis using
|
| 34 |
+
BERTopic with sentence-transformer embeddings and agglomerative clustering.
|
| 35 |
+
|
| 36 |
+
1. load_scopus_csv(file_path: str)
|
| 37 |
+
β Load the CSV. Count papers, abstract sentences, title sentences.
|
| 38 |
+
β Strip boilerplate text from abstracts.
|
| 39 |
+
β Saves cleaned_data.json to outputs/.
|
| 40 |
+
β Input: absolute file path string.
|
| 41 |
+
|
| 42 |
+
2. run_bertopic_discovery(run_config: str)
|
| 43 |
+
β Embeds sentences using all-MiniLM-L6-v2.
|
| 44 |
+
β Clusters with AgglomerativeClustering (cosine, threshold=0.7).
|
| 45 |
+
β Extracts 5 nearest evidence sentences per cluster.
|
| 46 |
+
β Saves summaries_{tag}.json, embeddings_{tag}.npy, and 2 chart HTML files.
|
| 47 |
+
β Input JSON: {"columns": ["Abstract"]} or {"columns": ["Title"]}
|
| 48 |
+
β Run TWICE: once for Abstract (tag=abstract), once for Title (tag=title).
|
| 49 |
+
|
| 50 |
+
3. label_topics_with_llm(labelling_input: str)
|
| 51 |
+
β You (the LLM) read the top_sentences for each cluster from summaries_{tag}.json,
|
| 52 |
+
then SELF-SUPPLY the llm_labels list with your best label, category,
|
| 53 |
+
confidence (0β1), and reasoning for each cluster.
|
| 54 |
+
β Input JSON: {
|
| 55 |
+
"tag": "abstract",
|
| 56 |
+
"llm_labels": [
|
| 57 |
+
{"cluster_id": 0, "label": "AI in Healthcare", "category": "Applied AI",
|
| 58 |
+
"confidence": 0.92, "reasoning": "Sentences discuss medical diagnostics..."},
|
| 59 |
+
...
|
| 60 |
+
]
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
4. consolidate_into_themes(consolidation_input: str)
|
| 64 |
+
β Applies user approvals from the Review Table.
|
| 65 |
+
β Merges approved clusters into final themes with final labels.
|
| 66 |
+
β Saves themes_{tag}.json and chart_keywords.html.
|
| 67 |
+
β Input JSON: {
|
| 68 |
+
"tag": "abstract",
|
| 69 |
+
"approvals": [
|
| 70 |
+
{"cluster_id": 0, "approved": true, "rename_to": "AI in Medicine",
|
| 71 |
+
"reasoning": "Covers core domain"},
|
| 72 |
+
...
|
| 73 |
+
]
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
5. compare_with_taxonomy(taxonomy_input: str)
|
| 77 |
+
β Maps each final theme to the PAJAIS taxonomy.
|
| 78 |
+
β Marks each theme as MAPPED or NOVEL.
|
| 79 |
+
β You self-supply the mappings list.
|
| 80 |
+
β Input JSON: {
|
| 81 |
+
"tag": "abstract",
|
| 82 |
+
"mappings": [
|
| 83 |
+
{"final_label": "AI in Medicine", "pajais_category": "Healthcare IS",
|
| 84 |
+
"mapped": true},
|
| 85 |
+
...
|
| 86 |
+
]
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
6. generate_comparison_csv(comparison_input: str)
|
| 90 |
+
β Generates side-by-side CSV and Plotly chart comparing abstract vs title themes.
|
| 91 |
+
β Input JSON: {"tags": ["abstract", "title"]}
|
| 92 |
+
|
| 93 |
+
7. export_narrative(narrative_input: str)
|
| 94 |
+
β You write the ~500-word Section 7 narrative yourself.
|
| 95 |
+
β Input JSON: {
|
| 96 |
+
"tag": "abstract",
|
| 97 |
+
"narrative": "...(your 500-word narrative here)...",
|
| 98 |
+
"researcher_name": "..."
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
+
RUN CONFIGURATIONS
|
| 103 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 104 |
+
β’ Abstract run: columns = ["Abstract"] β tag = "abstract"
|
| 105 |
+
β’ Title run: columns = ["Title"] β tag = "title"
|
| 106 |
+
Always run BERTopic for BOTH configurations before Phase 3.
|
| 107 |
+
|
| 108 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 109 |
+
BRAUN & CLARKE 6-PHASE WORKFLOW
|
| 110 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 111 |
+
|
| 112 |
+
PHASE 1 β FAMILIARISATION
|
| 113 |
+
Goal: Understand the dataset.
|
| 114 |
+
Action:
|
| 115 |
+
1. Call load_scopus_csv(file_path) with the uploaded file path.
|
| 116 |
+
2. Report: total papers, abstract sentences, title sentences, column list.
|
| 117 |
+
3. Show 5 sample titles.
|
| 118 |
+
STOP after Phase 1. Say:
|
| 119 |
+
"β
Phase 1 complete. Familiarisation done. Say 'Start Phase 2' to begin coding."
|
| 120 |
+
|
| 121 |
+
βββββββββββββββββββββββββββββββββββββββββοΏ½οΏ½ββββββββββββββββββββ
|
| 122 |
+
|
| 123 |
+
PHASE 2 β INITIAL CODING
|
| 124 |
+
Goal: Generate initial semantic codes (clusters) from the corpus.
|
| 125 |
+
Actions:
|
| 126 |
+
1. Call run_bertopic_discovery({"columns": ["Abstract"]})
|
| 127 |
+
2. Call run_bertopic_discovery({"columns": ["Title"]})
|
| 128 |
+
3. Read outputs/summaries_abstract.json β list ALL cluster IDs and their top 2 sentences.
|
| 129 |
+
4. Analyse each cluster's top_sentences yourself.
|
| 130 |
+
5. Call label_topics_with_llm with your self-generated labels for the ABSTRACT run.
|
| 131 |
+
6. Call label_topics_with_llm with your self-generated labels for the TITLE run.
|
| 132 |
+
7. Build and present a REVIEW TABLE for the user (for abstract clusters):
|
| 133 |
+
Columns: [#, Topic Label, Top Evidence, Sentences, Papers, Approve, Rename To, Reasoning]
|
| 134 |
+
Fill Approve=True for confident clusters, Approve=False for weak/duplicate ones.
|
| 135 |
+
*** STOP GATE AFTER PHASE 2 ***
|
| 136 |
+
Say: "βΈοΈ STOP β Phase 2 complete. Review the table above.
|
| 137 |
+
Edit Approve/Rename To/Reasoning columns, then click Submit Review to proceed to Phase 3."
|
| 138 |
+
|
| 139 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 140 |
+
|
| 141 |
+
PHASE 3 β SEARCHING FOR THEMES
|
| 142 |
+
Goal: Group related codes into broader themes.
|
| 143 |
+
Trigger: User submits the review table (message begins with [REVIEW_TABLE_SUBMITTED]).
|
| 144 |
+
Actions:
|
| 145 |
+
1. Parse the JSON review table from the user's message.
|
| 146 |
+
2. Call consolidate_into_themes with the parsed approvals for "abstract".
|
| 147 |
+
3. Call consolidate_into_themes with approvals for "title" (approve all by default).
|
| 148 |
+
4. Report the final theme list with counts.
|
| 149 |
+
*** STOP GATE AFTER PHASE 3 ***
|
| 150 |
+
Say: "βΈοΈ STOP β Phase 3 complete. [N] themes consolidated.
|
| 151 |
+
Review the theme list above. Say 'Proceed to Phase 4' when satisfied."
|
| 152 |
+
|
| 153 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 154 |
+
|
| 155 |
+
PHASE 4 β REVIEWING THEMES
|
| 156 |
+
Goal: Theoretical saturation check.
|
| 157 |
+
Actions:
|
| 158 |
+
1. Analyse theme sizes and sentence counts.
|
| 159 |
+
2. Flag any theme with fewer than 3 sentences as POTENTIALLY WEAK.
|
| 160 |
+
3. Flag any two themes sharing >60% of their top keywords as POTENTIALLY OVERLAPPING.
|
| 161 |
+
4. Report saturation status: SATURATED or REQUIRES REVISION.
|
| 162 |
+
5. Recommend merges or splits if needed.
|
| 163 |
+
*** STOP GATE AFTER PHASE 4 ***
|
| 164 |
+
Say: "βΈοΈ STOP β Phase 4 complete. Saturation analysis done.
|
| 165 |
+
Say 'Proceed to Phase 5' to finalise theme names."
|
| 166 |
+
|
| 167 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 168 |
+
|
| 169 |
+
PHASE 5 β DEFINING AND NAMING THEMES
|
| 170 |
+
Goal: Finalize descriptive theme names and definitions.
|
| 171 |
+
Actions:
|
| 172 |
+
1. For each theme, write a 1-sentence definition.
|
| 173 |
+
2. Present final theme names and definitions in a clean table.
|
| 174 |
+
3. Confirm with user.
|
| 175 |
+
(No STOP gate β flows directly into Phase 5.5)
|
| 176 |
+
|
| 177 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 178 |
+
|
| 179 |
+
PHASE 5.5 β PAJAIS TAXONOMY MAPPING
|
| 180 |
+
Goal: Position themes within the IS research landscape.
|
| 181 |
+
Actions:
|
| 182 |
+
1. Call compare_with_taxonomy for the abstract run β self-supply your mappings.
|
| 183 |
+
2. Call compare_with_taxonomy for the title run β self-supply your mappings.
|
| 184 |
+
3. Present a table: Theme | PAJAIS Category | Status (MAPPED/NOVEL).
|
| 185 |
+
*** STOP GATE AFTER PHASE 5.5 ***
|
| 186 |
+
Say: "βΈοΈ STOP β Phase 5.5 complete. PAJAIS mapping done.
|
| 187 |
+
Say 'Generate Final Report' to proceed to Phase 6."
|
| 188 |
+
|
| 189 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 190 |
+
|
| 191 |
+
PHASE 6 β WRITING UP (REPORT)
|
| 192 |
+
Goal: Generate the final deliverables.
|
| 193 |
+
Actions:
|
| 194 |
+
1. Call generate_comparison_csv({"tags": ["abstract", "title"]})
|
| 195 |
+
2. Write a ~500-word academic narrative (Section 7) covering:
|
| 196 |
+
- Research context
|
| 197 |
+
- Summary of each theme with evidence
|
| 198 |
+
- Comparison of abstract vs title themes
|
| 199 |
+
- PAJAIS taxonomy positioning
|
| 200 |
+
- Implications for IS research
|
| 201 |
+
3. Call export_narrative with your narrative text.
|
| 202 |
+
4. Tell the user: outputs are in the outputs/ folder, click Refresh Downloads.
|
| 203 |
+
|
| 204 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 205 |
+
STRICT BEHAVIOURAL RULES
|
| 206 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 207 |
+
|
| 208 |
+
β’ ONE PHASE PER MESSAGE. Never jump ahead.
|
| 209 |
+
β’ At each STOP gate, wait for explicit user confirmation before proceeding.
|
| 210 |
+
β’ Never skip a phase.
|
| 211 |
+
β’ Always self-supply data for label_topics_with_llm, compare_with_taxonomy,
|
| 212 |
+
and export_narrative β do not ask the user for these.
|
| 213 |
+
β’ When the user submits a review table ([REVIEW_TABLE_SUBMITTED]), parse it
|
| 214 |
+
and call consolidate_into_themes immediately.
|
| 215 |
+
β’ Be concise. Avoid repeating instructions.
|
| 216 |
+
β’ If a tool returns an error, report it clearly and ask the user how to proceed.
|
| 217 |
+
β’ Keep all intermediate files in the outputs/ directory.
|
| 218 |
+
|
| 219 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 220 |
+
PHASE PROGRESS HTML FORMAT
|
| 221 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 222 |
+
After completing each phase, include in your response:
|
| 223 |
+
[PHASE_PROGRESS: P1=done, P2=done, P3=pending, P4=pending, P5=pending, P5.5=pending, P6=pending]
|
| 224 |
+
(Replace 'done'/'pending' accurately for the current state.)
|
| 225 |
+
"""
|
| 226 |
+
|
| 227 |
+
# ─── Agent ────────────────────────────────────────────────────────────────────
|
| 228 |
+
# The seven tools imported from tools.py, listed in the same order as they
# are numbered in SYSTEM_PROMPT.
tools_list = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]

# ReAct agent wired to the Mistral chat model, the tool list, the in-memory
# checkpointer and the system prompt defined above.
agent = create_react_agent(
    model=llm, tools=tools_list, checkpointer=memory, prompt=SYSTEM_PROMPT
)
|
| 244 |
+
|
| 245 |
+
# ─── Helpers for app.py ───────────────────────────────────────────────────────
|
| 246 |
+
|
| 247 |
+
def _parse_phase_progress(text: str) -> "str | tuple[str, str]":
    """Render the agent's ``[PHASE_PROGRESS: ...]`` tag as an HTML progress bar.

    Parameters
    ----------
    text : str
        Agent response that may contain a tag such as
        ``[PHASE_PROGRESS: P1=done, P2=pending, P3=active]``.

    Returns
    -------
    str
        When *text* contains no tag: the HTML bar with all seven phases
        shown as pending.
    tuple[str, str]
        When a tag is present: ``(html, clean_text)``, where ``clean_text``
        is *text* with every tag removed and outer whitespace stripped.

    Notes
    -----
    The two return shapes are intentional: callers tell them apart with
    ``isinstance(result, tuple)``. Phase names and status keywords are
    matched case-insensitively; an unknown or missing status renders as
    pending.
    """
    labels = ["P1", "P2", "P3", "P4", "P5", "P5.5", "P6"]
    # status keyword -> (icon, CSS colour)
    # NOTE(review): confirm the "active" glyph against the deployed file.
    status_map = {
        "done": ("✅", "#22c55e"),
        "pending": ("⬜", "#94a3b8"),
        "active": ("🔄", "#3b82f6"),
    }
    tag_pattern = r"\[PHASE_PROGRESS:(.*?)\]"

    match = re.search(tag_pattern, text, re.DOTALL)
    if not match:
        pending_badges = " ".join(
            f"<span style='margin-left:8px'>⬜ {label}</span>"
            for label in labels
        )
        return (
            "<div style='padding:10px;background:#f0f4ff;border-radius:8px'>"
            "<b>Phase Progress:</b> " + pending_badges + "</div>"
        )

    # Parse "P1=done, P2=pending, ..." into {"P1": "done", ...}; fragments
    # that are not exactly "name=value" are ignored.
    state = {}
    for part in match.group(1).split(","):
        pieces = part.strip().split("=")
        if len(pieces) == 2:
            # Normalise case so "p2=Done" is treated like "P2=done".
            state[pieces[0].strip().upper()] = pieces[1].strip().lower()

    def _badge(label):
        status = state.get(label, "pending")
        icon, color = status_map.get(status, status_map["pending"])
        return (
            f"<span style='margin-left:8px;color:{color};font-weight:600'>"
            f"{icon} {label}</span>"
        )

    badges = "".join(_badge(label) for label in labels)
    clean = re.sub(tag_pattern, "", text, flags=re.DOTALL).strip()
    html = (
        "<div style='padding:10px;background:#f0f4ff;border-radius:8px;"
        "font-family:sans-serif'>"
        f"<b>Phase Progress:</b>{badges}</div>"
    )
    return html, clean
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
def _split_table_row(line: str) -> list:
    """Split one table line into cells (markdown pipes, else tab / 2+ spaces)."""
    stripped = line.strip()
    if stripped.startswith("|"):
        return [cell.strip() for cell in stripped.strip("|").split("|")]
    return re.split(r"\t| {2,}", stripped)


def _review_column_for(header: str):
    """Map a header cell to its canonical review-table column, or ``None``."""
    key = header.lower()
    if "#" in key:
        return "#"
    if "cluster" in key and "id" in key:
        return "Cluster ID"
    if "topic" in key and "label" in key:
        return "Topic Label"
    # First matching keyword wins, so "Top Evidence" is claimed here before
    # the looser checks below are tried.
    for keyword, column in (
        ("evidence", "Top Evidence"),
        ("sentence", "Sentences"),
        ("paper", "Papers"),
        ("approve", "Approve"),
        ("rename", "Rename To"),
        ("reason", "Reasoning"),
    ):
        if keyword in key:
            return column
    return None


def _build_review_table(agent_text: str) -> list:
    """Parse the review table in an agent response into a list of row dicts.

    The header is the first line that starts with ``|`` and contains both
    ``#`` and ``Topic`` (markdown table). If no such line exists, the first
    line containing ``#`` and ``Topic`` is treated as the header of a
    tab- or multi-space-delimited table.

    Parameters
    ----------
    agent_text : str
        Full text of the agent response.

    Returns
    -------
    list
        One dict per data row with the keys ``#``, ``Topic Label``,
        ``Top Evidence``, ``Sentences``, ``Papers``, ``Approve`` (bool),
        ``Rename To`` and ``Reasoning``, plus ``Cluster ID`` when the header
        has such a column. Cell values are strings. Returns ``[]`` when no
        header line is found.
    """
    def _is_header(line):
        return "#" in line and "Topic" in line

    lines = agent_text.splitlines()
    header_idx = next(
        (
            i for i, ln in enumerate(lines)
            if ln.strip().startswith("|") and _is_header(ln)
        ),
        None,
    )

    if header_idx is not None:
        # Markdown table: skip an optional |---|---| separator row, then take
        # the contiguous run of '|' rows that follows.
        start = header_idx + 1
        if start < len(lines) and re.match(r"^\|[-\s:|]+\|$", lines[start].strip()):
            start += 1
        data_lines = []
        for ln in lines[start:]:
            if not ln.strip().startswith("|"):
                break
            data_lines.append(ln)
    else:
        # Fallback: TSV / whitespace-delimited table; every line after the
        # header is a candidate data row.
        lines = agent_text.strip().splitlines()
        header_idx = next(
            (i for i, ln in enumerate(lines) if _is_header(ln)), None
        )
        if header_idx is None:
            return []
        data_lines = lines[header_idx + 1:]

    # canonical column name -> index of that column in the header row
    header_map = {}
    for idx, header in enumerate(_split_table_row(lines[header_idx])):
        column = _review_column_for(header)
        if column is not None:
            header_map[column] = idx

    truthy = ("true", "yes", "✅", "1", "y", "approve")
    rows = []
    for ln in data_lines:
        cells = _split_table_row(ln)
        if len(cells) < 2:
            continue

        def cell(idx):
            # Short rows yield "" for missing trailing cells.
            return cells[idx] if idx < len(cells) else ""

        row = {
            "#": "",
            "Topic Label": "",
            "Top Evidence": "",
            "Sentences": "",
            "Papers": "",
            "Approve": False,
            "Rename To": "",
            "Reasoning": "",
        }
        for column, idx in header_map.items():
            value = cell(idx)
            if column == "#":
                row[column] = value or cell(0)
            elif column == "Topic Label":
                # An empty label falls back to the second cell of the row.
                row[column] = value or cell(1)
            elif column == "Approve":
                row[column] = value.lower() in truthy
            else:
                row[column] = value
        rows.append(row)
    return rows
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
def get_agent_state(thread_id: str) -> dict:
    """Look up what the module-level ``memory`` checkpointer holds for a thread.

    Parameters
    ----------
    thread_id : str
        Conversation thread identifier, placed under
        ``config["configurable"]["thread_id"]`` for the lookup.

    Returns
    -------
    dict
        Whatever ``memory.get`` returns for that config, or an empty dict
        when the lookup yields nothing (or any falsy value).
    """
    checkpoint = memory.get({"configurable": {"thread_id": thread_id}})
    return checkpoint if checkpoint else {}
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
def _split_phase_progress(text: str) -> tuple:
    """Normalise ``_parse_phase_progress`` output to ``(phase_html, clean_text)``."""
    parsed = _parse_phase_progress(text)
    if isinstance(parsed, tuple):
        return parsed[0], parsed[1]
    # No tag was found: only the HTML bar came back and the text is unchanged.
    return parsed, text


def run_agent(user_message: str, context: dict, chat_history: list):
    """Handle one user turn and return what the UI needs to refresh.

    Parameters
    ----------
    user_message : str
        The user's message or a ``[REVIEW_TABLE_SUBMITTED]`` payload.
    context : dict
        May provide ``file_path`` (default ``""``) and ``thread_id``
        (default ``"thread-001"``).
    chat_history : list
        Accepted for interface compatibility; not read here. The agent is
        invoked with only the new message plus the ``thread_id`` config.

    Returns
    -------
    tuple
        ``(response_text, review_table_data, phase_bar_html)`` where
        ``review_table_data`` is a list of row dicts (possibly empty) and
        ``phase_bar_html`` is always an HTML string.
    """
    file_path = context.get("file_path", "")
    thread_id = context.get("thread_id", "thread-001")
    summaries_path = "outputs/summaries_abstract.json"

    # Shortcut: "Start Phase 2" builds the review table straight from the
    # saved abstract summaries, without calling the LLM.
    if user_message.strip().lower().startswith("start phase 2"):
        if not os.path.exists(summaries_path):
            # NOTE(review): this branch never reaches the agent, so the
            # summaries must have been produced some other way - confirm
            # that is the intended flow.
            phase_html, _ = _split_phase_progress(
                "[PHASE_PROGRESS: P1=done, P2=pending, P3=pending, P4=pending, "
                "P5=pending, P5.5=pending, P6=pending]"
            )
            return (
                "Summaries not found. Run BERTopic discovery first (Phase 2).",
                [],
                phase_html,
            )

        with open(summaries_path, encoding="utf-8") as f:
            summaries = json.load(f)

        # Largest 20 clusters first.
        top = sorted(summaries, key=lambda s: s.get("size", 0), reverse=True)[:20]

        md_lines = [
            "| # | Cluster ID | Topic Label | Top Evidence | Sentences | Papers | Approve | Rename To | Reasoning |",
            "|---|------------|-------------|--------------|-----------|--------|---------|-----------|-----------|",
        ]
        for i, s in enumerate(top, start=1):
            top_ev = "; ".join(s.get("top_sentences", [])[:2])
            md_lines.append(
                f"| {i} | {s.get('cluster_id')} | {s.get('label','')} | {top_ev} "
                f"| {s.get('size',0)} | {len(s.get('papers',[]))} | ✅ | | |"
            )
        md_table = "\n".join(md_lines)

        phase_html, _ = _split_phase_progress(
            "[PHASE_PROGRESS: P1=done, P2=done, P3=pending, P4=pending, "
            "P5=pending, P5.5=pending, P6=pending]"
        )
        return md_table, _build_review_table(md_table), phase_html

    if not os.environ.get("MISTRAL_API_KEY"):
        phase_html, _ = _split_phase_progress("")
        return (
            "Mistral API key is missing. Set the `MISTRAL_API_KEY` environment variable, "
            "restart the app, and then try again.",
            [],
            phase_html,
        )

    # Prepend a file path hint when one is available.
    full_message = (
        f"[FILE_PATH: {file_path}]\n{user_message}" if file_path else user_message
    )

    config = {"configurable": {"thread_id": thread_id}}
    try:
        response = agent.invoke({"messages": [("human", full_message)]}, config=config)
        ai_text = response["messages"][-1].content
    except Exception as exc:
        # Top-level boundary: report the failure in the chat instead of
        # letting it propagate to the caller.
        phase_html, _ = _split_phase_progress("")
        return (f"Agent execution failed: {exc}", [], phase_html)

    phase_html, clean_text = _split_phase_progress(ai_text)
    review_data = _build_review_table(clean_text)

    # Fallback: if the agent emitted no parseable review table but the
    # abstract summaries exist, fill the table from them (nothing approved).
    if not review_data and os.path.exists(summaries_path):
        try:
            with open(summaries_path, encoding="utf-8") as f:
                summaries = json.load(f)
            review_data = [
                {
                    "#": s.get("cluster_id", ""),
                    "Topic Label": s.get("label", ""),
                    "Top Evidence": "; ".join(s.get("top_sentences", [])[:2]),
                    "Sentences": s.get("size", 0),
                    "Papers": len(s.get("papers", [])),
                    "Approve": False,
                    "Rename To": "",
                    "Reasoning": "",
                }
                for s in summaries
            ]
        except (OSError, ValueError, TypeError, AttributeError):
            # Best effort only: an unreadable or malformed summaries file
            # just leaves the review table empty.
            review_data = []

    return clean_text, review_data, phase_html
|
app.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app.py — Gradio UI for BERTopic Agentic Thematic Analysis
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from agent import run_agent
|
| 8 |
+
|
| 9 |
+
def format_chat_history(history):
    """Return *history* unchanged.

    Compatibility pass-through: the event handlers do their own history
    normalisation, so nothing is converted here.
    """
    return history
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def send_message(user_message, chat_history, file_path, thread_id):
    """Send one chat message to the agent and build the refreshed UI values.

    Parameters
    ----------
    user_message : str
        Text typed by the user; a whitespace-only message is ignored.
    chat_history : list
        Current chat history, either role/content dicts or
        ``(user, assistant)`` pairs.
    file_path, thread_id
        Forwarded to ``run_agent`` inside its ``context`` dict.

    Returns
    -------
    tuple
        ``(new_chat, "", review_df, phase_html)``. For a blank message the
        history is returned untouched and the last two items are
        ``gr.update()`` no-ops.
    """
    if not user_message.strip():
        return chat_history, "", gr.update(), gr.update()

    user_roles = ("user", "human")
    bot_roles = ("assistant", "ai")

    def _pairs_from_history(hist):
        # Collapse role/content dicts into (user, assistant) tuples; any
        # other non-empty history is passed through unchanged.
        if not hist:
            return []
        if not isinstance(hist[0], dict):
            return hist or []
        pairs = []
        pos = 0
        while pos < len(hist) - 1:
            first, second = hist[pos], hist[pos + 1]
            is_turn = (
                first.get("role", "") in user_roles
                and second.get("role", "") in bot_roles
            )
            if is_turn:
                pairs.append((first.get("content", ""), second.get("content", "")))
                pos += 2
            else:
                pos += 1
        return pairs

    def _messages_from_pairs(pairs):
        # Expand (user, assistant) tuples back into role/content dicts.
        messages = []
        for turn in pairs:
            if isinstance(turn, (list, tuple)) and len(turn) >= 2:
                messages.append({"role": "user", "content": turn[0]})
                messages.append({"role": "assistant", "content": turn[1]})
        return messages

    agent_chat_history = _pairs_from_history(chat_history)
    response, review_data, phase_html = run_agent(
        user_message,
        {"file_path": file_path, "thread_id": thread_id},
        agent_chat_history,
    )

    # Keep a dict-style history as it is; otherwise rebuild it from pairs.
    already_messages = (
        isinstance(chat_history, list)
        and chat_history
        and isinstance(chat_history[0], dict)
    )
    if already_messages:
        new_chat = chat_history.copy()
    else:
        new_chat = _messages_from_pairs(agent_chat_history)

    new_chat.append({"role": "user", "content": user_message})
    new_chat.append({"role": "assistant", "content": response})

    if review_data:
        review_df = pd.DataFrame(review_data)
    else:
        # No rows: an empty frame that still carries the review columns.
        review_df = pd.DataFrame(
            columns=["#", "Topic Label", "Top Evidence", "Sentences", "Papers",
                     "Approve", "Rename To", "Reasoning"]
        )
    return new_chat, "", review_df, phase_html
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def submit_review(review_df, chat_history, file_path, thread_id):
    """Send the edited review table to the agent and build the refreshed UI values.

    The table is serialised with ``to_json(orient="records")`` and sent as a
    message prefixed with ``[REVIEW_TABLE_SUBMITTED]``.

    Parameters
    ----------
    review_df : pandas.DataFrame
        The review table as edited in the UI.
    chat_history : list
        Current chat history, either role/content dicts or
        ``(user, assistant)`` pairs.
    file_path, thread_id
        Forwarded to ``run_agent`` inside its ``context`` dict.

    Returns
    -------
    tuple
        ``(new_chat, new_df, phase_html)``; ``new_df`` is the submitted
        frame itself when the agent returned no new review rows.
    """
    payload = review_df.to_json(orient="records")
    review_message = f"[REVIEW_TABLE_SUBMITTED]\n{payload}"

    user_roles = ("user", "human")
    bot_roles = ("assistant", "ai")

    def _pairs_from_history(hist):
        # Same normalisation as send_message: dict messages -> (user, ai) pairs.
        if not hist:
            return []
        if not isinstance(hist[0], dict):
            return hist or []
        pairs = []
        pos = 0
        while pos < len(hist) - 1:
            first, second = hist[pos], hist[pos + 1]
            is_turn = (
                first.get("role", "") in user_roles
                and second.get("role", "") in bot_roles
            )
            if is_turn:
                pairs.append((first.get("content", ""), second.get("content", "")))
                pos += 2
            else:
                pos += 1
        return pairs

    agent_chat_history = _pairs_from_history(chat_history)
    response, new_review_data, phase_html = run_agent(
        review_message,
        {"file_path": file_path, "thread_id": thread_id},
        agent_chat_history,
    )

    # Keep a dict-style history as it is; otherwise rebuild it from pairs.
    already_messages = (
        isinstance(chat_history, list)
        and chat_history
        and isinstance(chat_history[0], dict)
    )
    if already_messages:
        new_chat = chat_history.copy()
    else:
        new_chat = []
        for turn in (agent_chat_history or []):
            if isinstance(turn, (list, tuple)) and len(turn) >= 2:
                new_chat.append({"role": "user", "content": turn[0]})
                new_chat.append({"role": "assistant", "content": turn[1]})

    # The raw JSON payload is not echoed into the chat; a short marker is.
    new_chat.append({"role": "user", "content": "(Review table submitted)"})
    new_chat.append({"role": "assistant", "content": response})

    if new_review_data:
        new_df = pd.DataFrame(new_review_data)
    else:
        new_df = review_df
    return new_chat, new_df, phase_html
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def get_download_files():
    """
    Collect output files available for download.

    Looks in ``outputs/`` for ``.csv``, ``.json`` and ``.txt`` files, grouped
    in that order and sorted by name within each group so the list is stable
    between refreshes.

    Returns:
        List of relative file paths, or ``None`` when nothing matches.
    """
    import glob

    patterns = ("outputs/*.csv", "outputs/*.json", "outputs/*.txt")
    # glob order is filesystem-dependent; sort each group for a stable listing.
    files = [path for pattern in patterns for path in sorted(glob.glob(pattern))]
    return files if files else None
|
| 113 |
+
|
| 114 |
+
# Gradio UI: upload -> agent chat -> review table / charts / downloads.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation was lost, and the emoji in labels were reconstructed from
# mis-encoded text; confirm both against the original app.py.
with gr.Blocks(title="BERTopic Agentic Thematic Analysis") as demo:
    # NOTE(review): the initial thread id is the same literal for every
    # session; if run_agent keys conversation memory on it, concurrent users
    # would share state. Confirm against the agent implementation.
    thread_id_state = gr.State("thread-001")
    uploaded_path_state = gr.State(None)

    gr.Markdown(
        "# 🔬 BERTopic Agentic Thematic Analysis\n"
        "Upload your Scopus CSV and follow the agent through Braun & Clarke's 6 phases."
    )

    # Initial tracker markup; send_message/submit_review replace it with the
    # phase HTML returned by the agent.
    phase_bar = gr.HTML(
        value="""
        <div style='padding:10px;background:#f0f4ff;border-radius:8px;font-family:sans-serif'>
        <b>Phase Progress:</b>
        <span style='margin-left:12px'>⬜ P1</span>
        <span style='margin-left:8px'>⬜ P2</span>
        <span style='margin-left:8px'>⬜ P3</span>
        <span style='margin-left:8px'>⬜ P4</span>
        <span style='margin-left:8px'>⬜ P5</span>
        <span style='margin-left:8px'>⬜ P5.5</span>
        <span style='margin-left:8px'>⬜ P6</span>
        </div>
        """,
        label="Phase Tracker"
    )

    with gr.Group():
        gr.Markdown("## 📂 Section 1: Upload Scopus CSV")
        csv_upload = gr.File(
            label="Upload Scopus CSV",
            file_types=[".csv"],
            type="filepath"
        )
        upload_status = gr.Textbox(label="Upload Status", interactive=False)

        def handle_upload(filepath):
            """Return a status message and the path to store in session state."""
            if filepath is None:
                return "No file uploaded.", None
            return f"✅ File loaded: {filepath}", filepath

        csv_upload.change(
            fn=handle_upload,
            inputs=[csv_upload],
            outputs=[upload_status, uploaded_path_state]
        )

    with gr.Group():
        gr.Markdown("## 💬 Section 2: Agent Chat")
        gr.Markdown(
            "_Start with:_ **'Start Phase 1'** to begin familiarisation, "
            "then follow the agent's instructions phase by phase."
        )
        chatbot = gr.Chatbot(height=420, label="Agent Conversation")
        with gr.Row():
            user_input = gr.Textbox(
                placeholder="Type your message or command here...",
                label="Your Message",
                scale=5
            )
            send_btn = gr.Button("Send ▶", variant="primary", scale=1)

    with gr.Group():
        gr.Markdown("## 📊 Section 3: Results")

        # Review Table
        gr.Markdown("### 🗂️ Topic Review Table")
        gr.Markdown(
            "Edit the **Approve** (True/False), **Rename To**, and **Reasoning** columns, "
            "then click **Submit Review** to proceed."
        )
        review_table = gr.Dataframe(
            headers=["#", "Topic Label", "Top Evidence", "Sentences",
                     "Papers", "Approve", "Rename To", "Reasoning"],
            datatype=["number", "str", "str", "number", "number", "bool", "str", "str"],
            interactive=True,
            label="Review Table",
            wrap=True,
            row_count=(5, "dynamic"),
            column_count=(8, "fixed")
        )
        submit_review_btn = gr.Button("✅ Submit Review", variant="secondary")

        gr.Markdown("### 📈 Topic Charts")
        with gr.Row():
            chart_selector = gr.Dropdown(
                choices=["Topic Distribution", "Similarity Heatmap",
                         "Top Keywords per Topic", "Abstract vs Title Comparison"],
                label="Select Chart",
                value="Topic Distribution"
            )
        chart_display = gr.HTML(label="Chart")

        def load_chart(chart_name):
            """Load pre-generated Plotly chart HTML from disk."""
            import os
            import html as _html
            chart_map = {
                "Topic Distribution": "outputs/chart_distribution.html",
                "Similarity Heatmap": "outputs/chart_heatmap.html",
                "Top Keywords per Topic": "outputs/chart_keywords.html",
                "Abstract vs Title Comparison": "outputs/chart_comparison.html",
            }
            # Unknown names map to "" which never exists, so they fall through
            # to the placeholder below.
            path = chart_map.get(chart_name, "")
            if os.path.exists(path):
                with open(path, "r", encoding="utf-8") as f:
                    content = f.read()
                # Embed the full HTML in an iframe via srcdoc so scripts execute.
                # Escape attribute characters but preserve the document structure.
                srcdoc = _html.escape(content, quote=True)
                iframe = (
                    f"<iframe srcdoc=\"{srcdoc}\" style=\"border:0; width:100%; height:700px;\"></iframe>"
                )
                return iframe
            return "<p style='color:grey'>Chart not yet generated. Complete the relevant phase first.</p>"

        chart_selector.change(fn=load_chart, inputs=[chart_selector], outputs=[chart_display])

        gr.Markdown("### 📥 Download Outputs")
        download_btn = gr.Button("🔄 Refresh Download List")
        download_files = gr.File(label="Available Output Files", file_count="multiple")

        download_btn.click(fn=get_download_files, inputs=[], outputs=[download_files])

    # Both the Send button and pressing Enter in the textbox send a message.
    send_btn.click(
        fn=send_message,
        inputs=[user_input, chatbot, uploaded_path_state, thread_id_state],
        outputs=[chatbot, user_input, review_table, phase_bar]
    )
    user_input.submit(
        fn=send_message,
        inputs=[user_input, chatbot, uploaded_path_state, thread_id_state],
        outputs=[chatbot, user_input, review_table, phase_bar]
    )
    submit_review_btn.click(
        fn=submit_review,
        inputs=[review_table, chatbot, uploaded_path_state, thread_id_state],
        outputs=[chatbot, review_table, phase_bar]
    )
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
if __name__ == "__main__":
    # Launch settings gathered in one place: no public share link, listen on
    # 0.0.0.0 port 7860, Soft theme.
    # NOTE(review): passing `theme` to launch() depends on the installed
    # Gradio version accepting it there; requirements.txt does not pin one.
    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "theme": gr.themes.Soft(),
    }
    demo.launch(**launch_options)
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
langchain-core
|
| 3 |
+
langchain-mistralai
|
| 4 |
+
langgraph
|
| 5 |
+
sentence-transformers
|
| 6 |
+
scikit-learn
|
| 7 |
+
bertopic
|
| 8 |
+
plotly
|
| 9 |
+
numpy
|
| 10 |
+
pandas
|
| 11 |
+
hdbscan
|
| 12 |
+
umap-learn
|
| 13 |
+
pynndescent
|
tools.py
ADDED
|
@@ -0,0 +1,544 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
tools.py — 7 Stateless LangChain Tools for BERTopic Agentic Thematic Analysis
|
| 3 |
+
All tools are decorated with @tool and use handle_tool_error=True.
|
| 4 |
+
No if/elif/else, no for/while loops, no try/except blocks.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
import re
|
| 10 |
+
import numpy as np
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import plotly.express as px
|
| 13 |
+
import plotly.graph_objects as go
|
| 14 |
+
from langchain_core.tools import tool
|
| 15 |
+
from sentence_transformers import SentenceTransformer
|
| 16 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 17 |
+
from sklearn.preprocessing import normalize
|
| 18 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 19 |
+
|
| 20 |
+
os.makedirs("outputs", exist_ok=True)
|
| 21 |
+
|
| 22 |
+
# Regex fragments for publisher boilerplate and stock opening phrases that
# should be stripped from titles/abstracts before sentence splitting. They are
# OR-ed together and applied case-insensitively by _clean_text.
BOILERPLATE_PATTERNS = [
    # Copyright sign written as an escape so the pattern survives re-encoding
    # of this file; removes everything from the sign to the end of the line.
    r"\u00a9\s*\d{4}.*",
    r"all rights reserved.*",
    r"published by elsevier.*",
    r"this paper (proposes|presents|investigates|aims)",
    r"in this (paper|study|article|work)",
    r"the purpose of this (paper|study)",
]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _clean_text(text: str) -> str:
    """Lower-case *text* and delete every BOILERPLATE_PATTERNS match from it."""
    lowered = str(text).lower().strip()
    # One alternation over all fragments so a single pass removes every match.
    combined_pattern = "|".join(BOILERPLATE_PATTERNS)
    without_boilerplate = re.sub(combined_pattern, "", lowered, flags=re.IGNORECASE)
    return without_boilerplate.strip()
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _split_sentences(text: str) -> list:
    """
    Split *text* after '.', '!' or '?' followed by whitespace.

    Only fragments with more than four whitespace-separated words are kept.
    """
    candidates = re.split(r"(?<=[.!?])\s+", str(text).strip())
    return [fragment for fragment in candidates if len(fragment.split()) > 4]
|
| 43 |
+
|
| 44 |
+
@tool
def load_scopus_csv(file_path: str) -> str:
    """
    Read a Scopus CSV export, clean its text columns and report basic counts.

    Rows lacking a Title or an Abstract are dropped, cleaned copies of both
    columns are added, and the resulting table is written to
    outputs/cleaned_data.json.

    Args:
        file_path: Absolute or relative path to the Scopus CSV file.

    Returns:
        JSON string. On success: status, papers, abstract_sentences,
        title_sentences, columns, sample_titles. When Title or Abstract is
        absent: status "error", message, columns.
    """
    frame = pd.read_csv(file_path, encoding="utf-8-sig")

    absent = [name for name in ("Title", "Abstract") if name not in frame.columns]
    if absent:
        error_payload = {
            "status": "error",
            "message": f"Missing required columns: {', '.join(absent)}",
            "columns": list(frame.columns),
        }
        return json.dumps(error_payload, indent=2)

    # Drop rows where either text field is missing.
    has_both = frame["Title"].notna() & frame["Abstract"].notna()
    frame = frame[has_both].reset_index(drop=True)
    frame["Abstract_Clean"] = frame["Abstract"].map(_clean_text)
    frame["Title_Clean"] = frame["Title"].map(_clean_text)

    def _sentence_total(column_name):
        # Total number of kept sentences across every row of the column.
        per_row = frame[column_name].map(_split_sentences).map(len)
        return sum(per_row)

    abstract_sentences = _sentence_total("Abstract_Clean")
    title_sentences = _sentence_total("Title_Clean")

    frame.to_json("outputs/cleaned_data.json", orient="records", indent=2)

    summary = {
        "status": "loaded",
        "papers": int(len(frame)),
        "abstract_sentences": int(abstract_sentences),
        "title_sentences": int(title_sentences),
        "columns": list(frame.columns),
        "sample_titles": list(frame["Title"].head(5)),
    }
    return json.dumps(summary, indent=2)
|
| 84 |
+
|
| 85 |
+
@tool
def run_bertopic_discovery(run_config: str) -> str:
    """
    Embed sentences, cluster them, and write cluster summaries and charts.

    Sentences come from the cleaned columns in outputs/cleaned_data.json.
    They are embedded with all-MiniLM-L6-v2, L2-normalised, and clustered with
    average-linkage AgglomerativeClustering at cosine distance threshold 0.3
    (i.e. cosine similarity 0.7). For every cluster the five sentences closest
    to the centroid are kept as evidence.

    Files written: outputs/embeddings_{tag}.npy, outputs/summaries_{tag}.json,
    outputs/chart_distribution.html and outputs/chart_heatmap.html. The chart
    file names carry no tag, so each run overwrites the previous run's charts.

    Args:
        run_config: JSON string with key 'columns', a list of column names to
            use, e.g. '{"columns": ["Abstract"]}' or '{"columns": ["Title"]}'
            or '{"columns": ["Abstract", "Title"]}'.

    Returns:
        JSON string with status, tag, n_clusters, total_sentences and
        summaries_file, or status "error" with a message when the cleaned
        data or the requested columns are missing.
    """
    config = json.loads(run_config)
    columns = config.get("columns", ["Abstract"])
    tag = "_".join(columns).lower()
    summaries_path = f"outputs/summaries_{tag}.json"

    cleaned_data_path = "outputs/cleaned_data.json"
    if not os.path.exists(cleaned_data_path):
        return json.dumps({
            "status": "error",
            "message": "Cleaned data file not found. Run load_scopus_csv first.",
        }, indent=2)

    df = pd.read_json(cleaned_data_path)
    col_map = {"Abstract": "Abstract_Clean", "Title": "Title_Clean"}
    use_cols = [col_map.get(c, c) for c in columns]
    missing_columns = [c for c in use_cols if c not in df.columns]
    if missing_columns:
        return json.dumps({
            "status": "error",
            "message": f"Missing cleaned columns: {', '.join(missing_columns)}",
            "available_columns": list(df.columns),
        }, indent=2)

    # Collect (sentence, paper_index) pairs. A flat comprehension keeps this
    # linear; repeatedly concatenating lists with sum(..., []) is quadratic.
    all_pairs = [
        (sentence, idx)
        for idx, row in df.iterrows()
        for sentence in _split_sentences(" ".join(str(row[c]) for c in use_cols))
    ]
    sentences = [pair[0] for pair in all_pairs]
    paper_ids = [pair[1] for pair in all_pairs]

    if not sentences:
        with open(summaries_path, "w", encoding="utf-8") as f:
            json.dump([], f, indent=2)
        return json.dumps({
            "status": "completed",
            "tag": tag,
            "n_clusters": 0,
            "total_sentences": 0,
            "summaries_file": summaries_path,
            "message": "No sentences available after preprocessing.",
        }, indent=2)

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(sentences, show_progress_bar=True, batch_size=64)
    # Keep everything float32 and L2-normalise to avoid large float64 copies.
    embeddings = np.asarray(embeddings, dtype=np.float32)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    embeddings = (embeddings / (norms + 1e-12)).astype(np.float32, copy=False)
    np.save(f"outputs/embeddings_{tag}.npy", embeddings)

    if len(sentences) < 2:
        # AgglomerativeClustering rejects fewer than two samples; a lone
        # sentence simply forms its own cluster.
        labels = np.zeros(len(sentences), dtype=int)
    else:
        clusterer = AgglomerativeClustering(
            n_clusters=None,
            metric="cosine",
            linkage="average",
            distance_threshold=0.3  # cosine distance = 1 - similarity; 0.3 = similarity 0.7
        )
        labels = np.asarray(clusterer.fit_predict(embeddings))
    n_clusters = int(labels.max() + 1)

    # Row indices of each cluster's members, computed once and reused below.
    members = {cid: np.where(labels == cid)[0] for cid in range(n_clusters)}

    def _summarise_cluster(cid):
        mask = members[cid]
        vecs = embeddings[mask]
        if vecs.size == 0:
            top_sents = []
            top_pids = []
            size = 0
        else:
            centroid = vecs.mean(axis=0, keepdims=True)
            sims = cosine_similarity(centroid, vecs)[0]
            # Indices (into `sentences`) of the five members nearest the centroid.
            top5_idx = mask[np.argsort(sims)[::-1][:5]]
            top_sents = [sentences[i] for i in top5_idx]
            # NOTE(review): "papers" lists only the papers behind the top-5
            # evidence sentences, not every paper in the cluster; confirm
            # this is the intended meaning for the review table.
            top_pids = sorted({int(paper_ids[i]) for i in top5_idx})
            size = int(len(mask))
        return {
            "cluster_id": cid,
            "size": size,
            "papers": top_pids,
            "top_sentences": top_sents,
            "label": f"Cluster_{cid}",
            "approved": False,
            "rename_to": "",
            "reasoning": "",
        }

    summaries = [_summarise_cluster(cid) for cid in range(n_clusters)]
    with open(summaries_path, "w", encoding="utf-8") as f:
        json.dump(summaries, f, indent=2)

    sizes = [s["size"] for s in summaries]
    cids = [f"C{s['cluster_id']}" for s in summaries]

    fig_dist = px.bar(x=cids, y=sizes, labels={"x": "Cluster", "y": "Sentences"},
                      title=f"Topic Distribution ({tag})", color=sizes,
                      color_continuous_scale="Blues")
    fig_dist.write_html("outputs/chart_distribution.html")

    # Build centroids (one vector per cluster) using float32 to reduce memory.
    def _centroid(mask):
        if mask.size == 0:
            return np.zeros((embeddings.shape[1],), dtype=np.float32)
        return embeddings[mask].mean(axis=0).astype(np.float32)

    centroids = np.vstack(
        [_centroid(members[s["cluster_id"]]) for s in summaries]
    ).astype(np.float32)

    # Avoid computing an enormous n_clusters x n_clusters heatmap which can OOM.
    HEATMAP_MAX = 300
    if centroids.shape[0] > HEATMAP_MAX:
        with open("outputs/chart_heatmap.html", "w", encoding="utf-8") as f:
            f.write(f"<p style='color:grey'>Heatmap skipped: {centroids.shape[0]} clusters exceeds safe limit ({HEATMAP_MAX}).</p>")
    else:
        sim_matrix = cosine_similarity(centroids.astype(np.float32))
        fig_heat = go.Figure(go.Heatmap(z=sim_matrix, x=cids, y=cids,
                                        colorscale="Viridis"))
        fig_heat.update_layout(title=f"Cluster Similarity Heatmap ({tag})")
        fig_heat.write_html("outputs/chart_heatmap.html")

    return json.dumps({
        "status": "completed",
        "tag": tag,
        "n_clusters": n_clusters,
        "total_sentences": len(sentences),
        "summaries_file": summaries_path,
    }, indent=2)
|
| 233 |
+
|
| 234 |
+
@tool
def label_topics_with_llm(labelling_input: str) -> str:
    """
    Store LLM-produced labels on the saved cluster summaries.

    The labels themselves are supplied by the caller; this tool merges them
    into outputs/summaries_{tag}.json by cluster id. Clusters that receive no
    entry keep whatever label, category, confidence and reasoning they
    already had.

    Args:
        labelling_input: JSON string with keys:
            - 'tag': run tag (e.g. 'abstract' or 'title')
            - 'llm_labels': list of dicts, each with keys
              'cluster_id', 'label', 'category', 'confidence', 'reasoning'
              as returned by the LLM's own analysis.

    Returns:
        JSON string with status, tag, topics_labelled (number of summaries
        written) and labels_applied (number of summaries that matched an
        entry), or status "error" when the summaries file does not exist.
    """
    data = json.loads(labelling_input)
    tag = data.get("tag", "abstract")
    llm_labels = data.get("llm_labels", [])

    summaries_path = f"outputs/summaries_{tag}.json"
    if not os.path.exists(summaries_path):
        return json.dumps({
            "status": "error",
            "message": "Summaries file not found. Run run_bertopic_discovery first.",
        }, indent=2)

    with open(summaries_path, encoding="utf-8") as f:
        summaries = json.load(f)

    def _cid(value):
        # Summaries store integer ids; the LLM may send "3" instead of 3.
        try:
            return int(value)
        except (TypeError, ValueError):
            return value

    label_map = {_cid(item["cluster_id"]): item for item in llm_labels}

    def _apply_label(s):
        update = label_map.get(_cid(s["cluster_id"]), {})
        # Fall back to stored values so a partial re-labelling does not wipe
        # metadata saved by an earlier call.
        return {**s,
                "label": update.get("label", s["label"]),
                "category": update.get("category", s.get("category", "")),
                "confidence": update.get("confidence", s.get("confidence", 0.0)),
                "reasoning": update.get("reasoning", s.get("reasoning", ""))}

    updated = [_apply_label(s) for s in summaries]
    with open(summaries_path, "w", encoding="utf-8") as f:
        json.dump(updated, f, indent=2)

    applied = sum(1 for s in summaries if _cid(s["cluster_id"]) in label_map)

    return json.dumps({
        "status": "labelled",
        "tag": tag,
        "topics_labelled": len(updated),
        "labels_applied": applied,
    }, indent=2)
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 280 |
+
# TOOL 4 β consolidate_into_themes
|
| 281 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 282 |
+
@tool
def consolidate_into_themes(consolidation_input: str) -> str:
    """
    Turn the approved clusters of a run into final themes.

    Each approved cluster becomes one theme whose 'final_label' is the
    reviewer's rename when one was given, otherwise the existing label. The
    themes are saved to outputs/themes_{tag}.json and a keyword bar chart is
    written to outputs/chart_keywords.html. No clusters are merged and no
    centroids are recomputed here.

    Args:
        consolidation_input: JSON string with keys:
            - 'tag': run tag
            - 'approvals': list of dicts with keys
              'cluster_id', 'approved' (bool), 'rename_to' (str), 'reasoning' (str)

    Returns:
        JSON string with status, tag, themes_count and themes (the final
        labels), or status "error" when the summaries file does not exist.
    """
    data = json.loads(consolidation_input)
    tag = data.get("tag", "abstract")
    approvals = data.get("approvals", [])

    summaries_path = f"outputs/summaries_{tag}.json"
    if not os.path.exists(summaries_path):
        return json.dumps({
            "status": "error",
            "message": "Summaries file not found. Run run_bertopic_discovery first.",
        }, indent=2)

    with open(summaries_path, encoding="utf-8") as f:
        summaries = json.load(f)

    def _cid(value):
        # Summaries store integer ids; approvals may carry them as strings.
        try:
            return int(value)
        except (TypeError, ValueError):
            return value

    approval_map = {_cid(a["cluster_id"]): a for a in approvals}

    def _apply_approval(s):
        a = approval_map.get(_cid(s["cluster_id"]), {})
        # Empty review-table cells arrive as JSON null; normalise to "" so the
        # later .strip() cannot fail on None.
        return {**s,
                "approved": a.get("approved", False),
                "rename_to": str(a.get("rename_to") or ""),
                "reasoning": str(a.get("reasoning") or "")}

    updated = [_apply_approval(s) for s in summaries]
    approved = [s for s in updated if s["approved"]]

    def _finalise(s):
        final_label = s["rename_to"].strip() or s["label"]
        return {**s, "final_label": final_label}

    themes = [_finalise(s) for s in approved]

    with open(f"outputs/themes_{tag}.json", "w", encoding="utf-8") as f:
        json.dump(themes, f, indent=2)

    # -- Keyword chart per theme --
    from collections import Counter
    stop = {"the","a","an","of","in","and","to","is","for","with","that","this","on","are",
            "by","as","from","be","was","at","it","or","has","have","been","which","their"}

    def _top_words(s):
        # Five most frequent non-stopword tokens (4+ letters) in the evidence.
        words = re.findall(r"\b[a-z]{4,}\b",
                           " ".join(s.get("top_sentences", [])).lower())
        counted = Counter(w for w in words if w not in stop).most_common(5)
        return [{"theme": s["final_label"], "word": word, "count": count}
                for word, count in counted]

    kw_rows = [row for theme in themes for row in _top_words(theme)]
    kw_df = pd.DataFrame(kw_rows)

    if len(kw_df) > 0:
        fig_kw = px.bar(kw_df, x="count", y="word", color="theme",
                        orientation="h", title="Top Keywords per Theme",
                        barmode="group")
        fig_kw.write_html("outputs/chart_keywords.html")
    else:
        with open("outputs/chart_keywords.html", "w", encoding="utf-8") as f:
            f.write("<p style='color:grey'>No approved themes available yet.</p>")

    return json.dumps({
        "status": "consolidated",
        "tag": tag,
        "themes_count": len(themes),
        "themes": [t["final_label"] for t in themes],
    }, indent=2)
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 360 |
+
# TOOL 5 β compare_with_taxonomy
|
| 361 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 362 |
+
@tool
def compare_with_taxonomy(taxonomy_input: str) -> str:
    """
    Record, for each final theme, whether it maps to a PAJAIS category.

    The mapping decisions are supplied by the caller; a theme is MAPPED when
    its entry has 'mapped' true and NOVEL otherwise (including themes with no
    entry). Results are saved to outputs/taxonomy_mapping_{tag}.json.

    Args:
        taxonomy_input: JSON string with keys:
            - 'tag': run tag
            - 'mappings': list of dicts with keys
              'final_label', 'pajais_category' (str or ''), 'mapped' (bool)

    Returns:
        JSON string with status, tag, total_themes, mapped_count,
        novel_count, pajais_taxonomy (the category list) and output_file, or
        status "error" when the themes file does not exist.
    """
    PAJAIS_TAXONOMY = [
        "IS Strategy & Governance", "AI & Machine Learning Applications",
        "Digital Transformation", "Human-Computer Interaction",
        "Knowledge Management", "Information Security & Privacy",
        "Business Intelligence & Analytics", "Enterprise Systems",
        "E-Commerce & Digital Markets", "IT Adoption & Acceptance",
        "Social Media & Collaboration", "Healthcare IS",
        "IS Research Methods", "Emerging Technologies",
    ]

    data = json.loads(taxonomy_input)
    tag = data.get("tag", "abstract")
    mappings = data.get("mappings", [])

    themes_path = f"outputs/themes_{tag}.json"
    if not os.path.exists(themes_path):
        return json.dumps({
            "status": "error",
            "message": "Themes file not found. Run consolidate_into_themes first.",
        }, indent=2)

    with open(themes_path, encoding="utf-8") as f:
        themes = json.load(f)

    mapping_map = {m["final_label"]: m for m in mappings}

    def _map_theme(t):
        m = mapping_map.get(t["final_label"], {})
        status = "MAPPED" if m.get("mapped", False) else "NOVEL"
        return {**t,
                # A JSON null category is stored as "" like a missing one.
                "pajais_category": m.get("pajais_category") or "",
                "mapping_status": status}

    mapped_themes = [_map_theme(t) for t in themes]

    output_path = f"outputs/taxonomy_mapping_{tag}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(mapped_themes, f, indent=2)

    mapped_count = sum(1 for t in mapped_themes if t["mapping_status"] == "MAPPED")
    novel_count = len(mapped_themes) - mapped_count

    return json.dumps({
        "status": "mapped",
        "tag": tag,
        "total_themes": len(mapped_themes),
        "mapped_count": mapped_count,
        "novel_count": novel_count,
        "pajais_taxonomy": PAJAIS_TAXONOMY,
        "output_file": output_path,
    }, indent=2)
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 423 |
+
# TOOL 6 β generate_comparison_csv
|
| 424 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 425 |
+
@tool
def generate_comparison_csv(comparison_input: str) -> str:
    """
    Compare the final themes of several runs (typically Abstract vs Title).

    Loads outputs/themes_{tag}.json for every tag, writes one row per theme
    to outputs/theme_comparison.csv and a grouped bar chart to
    outputs/chart_comparison.html.

    Args:
        comparison_input: JSON string with key 'tags', a list of run tags,
            e.g. '{"tags": ["abstract", "title"]}'.

    Returns:
        JSON string with status, csv_path, chart_path and total_rows, or
        status "error" naming the tags whose themes file is missing.
    """
    data = json.loads(comparison_input)
    tags = data.get("tags", ["abstract", "title"])

    missing_tags = [t for t in tags if not os.path.exists(f"outputs/themes_{t}.json")]
    if missing_tags:
        return json.dumps({
            "status": "error",
            "message": "Themes file not found for tags: "
                       f"{', '.join(map(str, missing_tags))}. "
                       "Run consolidate_into_themes for each tag first.",
        }, indent=2)

    def _load_themes(tag):
        path = f"outputs/themes_{tag}.json"
        with open(path, encoding="utf-8") as f:
            themes = json.load(f)
        return [{
            "tag": tag,
            "final_label": t["final_label"],
            "size": t["size"],
            "papers": len(t.get("papers", [])),
        } for t in themes]

    all_rows = [row for tag in tags for row in _load_themes(tag)]
    # Explicit columns so the CSV still has a header row when no theme exists.
    df = pd.DataFrame(all_rows, columns=["tag", "final_label", "size", "papers"])
    df.to_csv("outputs/theme_comparison.csv", index=False)

    if len(df) > 0:
        fig = px.bar(df, x="final_label", y="size", color="tag", barmode="group",
                     title="Abstract vs Title Theme Comparison",
                     labels={"final_label": "Theme", "size": "Sentences", "tag": "Source"})
        fig.write_html("outputs/chart_comparison.html")
    else:
        with open("outputs/chart_comparison.html", "w", encoding="utf-8") as f:
            f.write("<p style='color:grey'>No theme comparison available yet.</p>")

    return json.dumps({
        "status": "comparison_generated",
        "csv_path": "outputs/theme_comparison.csv",
        "chart_path": "outputs/chart_comparison.html",
        "total_rows": len(df),
    }, indent=2)
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 474 |
+
# TOOL 7 β export_narrative
|
| 475 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 476 |
+
@tool
|
| 477 |
+
def export_narrative(narrative_input: str) -> str:
    """
    Generate a ~500-word Section 7 narrative report summarising all themes,
    their PAJAIS mapping, and key insights. Save as narrative_report.txt.

    The narrative is capped at ``max_words`` words (an ellipsis is appended
    when it is cut), followed by a summary table built from
    ``outputs/taxonomy_mapping_<tag>.json``.

    Args:
        narrative_input: JSON string with keys:
            - 'tag': run tag to base report on (default 'abstract')
            - 'narrative': the 500-word narrative text (written by the LLM)
            - 'researcher_name': optional researcher name
            - 'max_words': optional word cap for the narrative (default 500;
              non-numeric or non-positive values fall back to the default)

    Returns:
        JSON string confirming report saved, with keys 'status',
        'report_path', 'word_count' (narrative words kept, excluding the
        ellipsis marker), 'trimmed' and 'themes_in_report'.

    Raises:
        json.JSONDecodeError: if narrative_input or the mapping file is not
            valid JSON.
        FileNotFoundError: if the taxonomy mapping for 'tag' does not exist.
        KeyError: if a theme entry has no 'final_label'.
    """
    default_max_words = 500

    data = json.loads(narrative_input)
    tag = data.get("tag", "abstract")
    # A JSON null narrative is treated like a missing one.
    narrative_text = data.get("narrative") or ""
    researcher_name = data.get("researcher_name", "Researcher")

    # Auto-trim narrative to a maximum word count to avoid oversized reports.
    try:
        max_words = int(data.get("max_words", default_max_words))
    except (TypeError, ValueError, OverflowError):
        max_words = default_max_words
    if max_words < 1:
        # A zero/negative cap would slice from the wrong end of the word list.
        max_words = default_max_words

    words = narrative_text.split()
    trimmed = len(words) > max_words
    if trimmed:
        words = words[:max_words]
        narrative_text = " ".join(words) + " ..."
    # Count the kept words only; the " ..." marker is not a narrative word.
    word_count = len(words)

    mapping_path = f"outputs/taxonomy_mapping_{tag}.json"
    with open(mapping_path, encoding="utf-8") as f:
        themes = json.load(f)

    # NOTE(review): the bullet/arrow/rule glyphs in the strings below look
    # mis-encoded; they are reproduced unchanged -- confirm the intended
    # characters before altering them.
    theme_lines = [
        f" β’ {t['final_label']} [{t.get('mapping_status','?')}]"
        f" β PAJAIS: {t.get('pajais_category','N/A')}"
        for t in themes
    ]

    full_report = "\n".join([
        "=" * 60,
        "SECTION 7: THEMATIC ANALYSIS NARRATIVE REPORT",
        f"Researcher: {researcher_name}",
        f"Source: {tag.upper()} columns",
        "=" * 60,
        "",
        narrative_text,
        "",
        "β" * 60,
        "THEME SUMMARY TABLE",
        "β" * 60,
        "\n".join(theme_lines),
        "",
        "=" * 60,
    ])

    report_path = "outputs/narrative_report.txt"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(full_report)

    return json.dumps({
        "status": "report_saved",
        "report_path": report_path,
        "word_count": word_count,
        "trimmed": trimmed,
        "themes_in_report": len(themes),
    }, indent=2)
|