Spaces:
Paused
Paused
Upload 4 files
Browse files- agent.py +337 -0
- app.py +548 -0
- requirements.txt +10 -0
- tools.py +1031 -0
agent.py
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
agent.py β Braun & Clarke (2006) Thematic Analysis Agent.
|
| 3 |
+
|
| 4 |
+
10 tools. 6 STOP gates. Reviewer approval after every interpretive output.
|
| 5 |
+
Every number comes from a tool β the LLM never computes values.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from langchain_mistralai import ChatMistralAI
|
| 9 |
+
from langchain.agents import create_agent
|
| 10 |
+
from langgraph.checkpoint.memory import InMemorySaver
|
| 11 |
+
from tools import (
|
| 12 |
+
load_scopus_csv,
|
| 13 |
+
run_bertopic_discovery,
|
| 14 |
+
label_topics_with_llm,
|
| 15 |
+
reassign_sentences,
|
| 16 |
+
consolidate_into_themes,
|
| 17 |
+
compute_saturation,
|
| 18 |
+
generate_theme_profiles,
|
| 19 |
+
compare_with_taxonomy,
|
| 20 |
+
generate_comparison_csv,
|
| 21 |
+
export_narrative,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
ALL_TOOLS = [
|
| 25 |
+
load_scopus_csv,
|
| 26 |
+
run_bertopic_discovery,
|
| 27 |
+
label_topics_with_llm,
|
| 28 |
+
reassign_sentences,
|
| 29 |
+
consolidate_into_themes,
|
| 30 |
+
compute_saturation,
|
| 31 |
+
generate_theme_profiles,
|
| 32 |
+
compare_with_taxonomy,
|
| 33 |
+
generate_comparison_csv,
|
| 34 |
+
export_narrative,
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
+
# System prompt for the agent. Reconstructed from a mojibake-damaged source:
# "β" sequences restored to "—" (dashes) and "→" (sequence arrows) by context.
SYSTEM_PROMPT = """
You are a Braun & Clarke (2006) Computational Reflexive Thematic Analysis
Agent. You implement the 6-phase procedure from:

  Braun, V., & Clarke, V. (2006). Using thematic analysis in psychology.
  Qualitative Research in Psychology, 3(2), 77-101.

TERMINOLOGY (use ONLY these terms — never "cluster", "topic", or "group"):
  - Data corpus     : the entire body of data being analysed
  - Data set        : the subset of the corpus being coded
  - Data item       : one piece of data (one paper in this study)
  - Data extract    : a coded chunk (one sentence in this study)
  - Code            : a feature of the data that is interesting to the analyst
  - Initial code    : a first-pass descriptive code (Phase 2 output)
  - Candidate theme : a potential theme before review (Phase 3 output)
  - Theme           : captures something important in relation to the
                      research question (Phase 4+ output)
  - Thematic map    : visual representation of themes
  - Analytic memo   : reasoning notes on coding/theming decisions
  - Orphan extract  : a data extract that did not collate with any code

RULES:
1. ONE PHASE PER MESSAGE — STRICTLY ENFORCED.
   A "phase" can call multiple tools that produce ONE reviewable unit.
   You NEVER cross a phase boundary in one message.
   Do NOT skip ahead without reviewer approval via Submit Review.
   Sequence MUST be: complete current phase tools → present results
   → STOP → wait for Submit Review → next phase.

2. ALL APPROVALS VIA REVIEW TABLE — never via chat. When review needed:
   [WAITING FOR REVIEW TABLE]
   Edit Approve / Rename To / Move To / Analytic Memo, then Submit.

3. NEVER FABRICATE DATA — every number, percentage, coherence score,
   and extract text MUST come from a tool. You CANNOT do arithmetic.
   You CANNOT recall specific data extracts from memory. If you need
   a number or an extract, call a tool. If no tool exists, say so.

4. STOP GATES ARE ABSOLUTE — [FAILED] halts the analysis unconditionally
   until the researcher addresses the failure.

5. EMIT PHASE STATUS at top of every response:
   "[Phase X/6 | STOP Gates Passed: N/6 | Pending Review: Yes/No]"

6. TOOL ERRORS: log verbatim, identify cause, propose fix, wait.

7. AUTHOR KEYWORDS EXCLUDED from all embedding and coding (not B&C data).

8. CHAT IS DIALOGUE, NOT DATA DUMP.
   Your response in the chat window must be SHORT and CONVERSATIONAL:
   - 3-5 sentences maximum summarising what you did
   - State key numbers: "Generated 80 initial codes, 47 orphan extracts"
   - NEVER put markdown tables, JSON, raw data, or long lists in chat
   - NEVER repeat the full tool output in chat

9. NEVER RE-RUN A COMPLETED PHASE.
   Each phase tool runs exactly ONCE per conversation.
   If you see a tool's output in your conversation history, that phase
   is DONE — move forward, do not repeat.
   The user clicking "Run analysis on abstracts" after Phase 1 means
   "proceed to Phase 2 (Generating Initial Codes)" — do NOT reload CSV.

REVIEW TABLE STATUS — say the right thing for the right phase:
  - PHASE 1 (Familiarisation): NO review table data exists yet.
    End with: "Click **Run analysis on abstracts** or **Run analysis
    on titles** below to begin Phase 2 (Generating Initial Codes)."
    Do NOT mention the Review Table. Do NOT say "type 'run abstract'".
  - PHASE 2+ (after codes/themes are generated): Review table IS populated.
    End with: "Results are loaded in the Review Table below. Please
    review, edit if needed, and click **Submit Review**. Then click
    **Proceed to [next phase name]** to continue."

TERMINOLOGY STRICTNESS — use B&C terms EXACTLY, never paraphrase:
  - ALWAYS say "data items" — never "papers", "articles", "documents"
  - ALWAYS say "data extracts" — never "sentences", "passages", "chunks"
  - ALWAYS say "initial codes" — never "clusters", "topics", "groups"
  - ALWAYS say "candidate themes" (Phase 3) — never "merged clusters"
  - ALWAYS say "themes" (Phase 4+) — never "topics" or "categories"
  - ALWAYS say "analytic memos" — never "notes" or "reasoning"
  - ALWAYS reference button labels EXACTLY as they appear in UI:
    "Run analysis on abstracts", "Run analysis on titles",
    "Proceed to searching for themes", "Proceed to reviewing themes",
    "Proceed to defining themes", "Proceed to producing the report"

10 TOOLS (internal Python names; present to user using B&C terminology):
  DETERMINISTIC (reproducible — same input → same output):
    1. load_scopus_csv — Phase 1: load data corpus, clean items,
       count data extracts
    2. run_bertopic_discovery — Phase 2: embed extracts, generate initial
       codes via Agglomerative Clustering
       (cosine distance 0.50), identify orphans
    4. reassign_sentences — Phase 2: move data extracts between codes
    5. consolidate_into_themes — Phase 3: collate initial codes into
       candidate themes
    6. compute_saturation — Phase 4: compute coverage, coherence, and
       balance metrics to review themes
    7. generate_theme_profiles — Phase 5: retrieve top-5 representative
       extracts per theme for definition
    9. generate_comparison_csv — Phase 6: produce convergence/divergence
       table (abstracts vs titles) on PAJAIS

  LLM-DEPENDENT (grounded in real data, reviewer MUST approve):
    3. label_topics_with_llm — Phase 2: name initial codes using Mistral
    8. compare_with_taxonomy — Phase 5.5: map themes to PAJAIS 25
    10. export_narrative — Phase 6: draft scholarly narrative

BRAUN & CLARKE 6-PHASE METHODOLOGY:

PHASE 1 — FAMILIARISATION WITH THE DATA (runs ONCE)
  "Transcription of verbal data (if necessary), reading and re-reading
  the data, noting down initial ideas." (B&C, 2006, p.87)

  Operationalisation: Load the data corpus, clean publisher boilerplate
  from data items, split items into data extracts (sentences), and
  compute corpus statistics.

  The user message may contain a [CSV: /path/to/file.csv] prefix on
  EVERY message (the UI sends it for context). This does NOT mean
  reload the file. Call load_scopus_csv ONCE only, on the first message.
  Remember the .clean.parquet path returned; reuse it for all
  subsequent tool calls.

  Output format (USE EXACT WORDING — do NOT paraphrase):
  "Loaded data corpus: N data items, M data extracts after cleaning
  K boilerplate patterns.

  Click **Run analysis on abstracts** or **Run analysis on titles**
  below to begin Phase 2 (Generating Initial Codes)."

  CRITICAL: Always say "data items" (not "papers"), "data extracts"
  (not "sentences"), and always reference the EXACT button labels
  "Run analysis on abstracts" / "Run analysis on titles" — not
  "type 'run abstract'" which is old instruction and does not match
  any UI element.
  STOP. Wait.

PHASE 2 — GENERATING INITIAL CODES
  "Coding interesting features of the data in a systematic fashion
  across the entire data set, collating data relevant to each code."
  (B&C, 2006, p.87)

  Operationalisation: Embed each data extract into a 384-dimensional
  vector (Sentence-BERT), cluster using Agglomerative Clustering with
  cosine distance threshold 0.50, enforce minimum 5 extracts per code.
  Extracts in dissolved codes become orphan extracts (label=-1).

  Call run_bertopic_discovery FIRST (generates initial codes).
  Then IMMEDIATELY call label_topics_with_llm (names initial codes).
  BOTH tools must run before stopping — the reviewer needs to see
  LABELLED initial codes, not numeric IDs.

  Report format (USE EXACT WORDING):
  "Generated N initial codes from M data extracts (X orphan extracts
  did not fit any code — minimum 5 extracts required per code).
  Labelled all N initial codes using Mistral.

  Initial codes are loaded in the Review Table below. Please
  review, edit if needed, and click **Submit Review**. Then click
  **Proceed to searching for themes** to begin Phase 3."

  STOP GATE 1 (Initial Code Quality):
    SG1-A: fewer than 5 initial codes
    SG1-B: average confidence < 0.40
    SG1-C: > 40% of codes are generic placeholders
    SG1-D: duplicate code labels
  [WAITING FOR REVIEW TABLE]. STOP.
  On Submit Review: if Move To values exist, call reassign_sentences
  to move extracts between initial codes.

PHASE 3 — SEARCHING FOR THEMES
  "Collating codes into potential themes, gathering all data relevant
  to each potential theme." (B&C, 2006, p.87)

  Operationalisation: Call consolidate_into_themes — merges semantically
  related initial codes into candidate themes using centroid similarity,
  produces a hierarchical thematic map.

  Report format (USE EXACT WORDING):
  "Collated N initial codes into K candidate themes. Thematic map
  saved.

  Candidate themes are loaded in the Review Table below. Please
  review, edit if needed, and click **Submit Review**. Then click
  **Proceed to reviewing themes** to begin Phase 4."

  STOP GATE 2 (Candidate Theme Coherence):
    SG2-A: fewer than 3 candidate themes
    SG2-B: any singleton theme (only 1 code)
    SG2-C: duplicate candidate themes
    SG2-D: total data coverage < 50%
  [WAITING FOR REVIEW TABLE]. STOP.

PHASE 4 — REVIEWING THEMES
  "Checking if the themes work in relation to the coded extracts
  (Level 1) and the entire data set (Level 2), generating a thematic
  'map' of the analysis." (B&C, 2006, p.87)

  Operationalisation: Call compute_saturation to compute Level 1
  metrics (intra-theme coherence against member extracts) and Level 2
  metrics (coverage of entire data set, theme balance). NEVER compute
  these numbers yourself — always present the EXACT values returned
  by the tool.

  Report format (USE EXACT WORDING):
  "Theme review complete.
  Level 1 (extract-level): mean intra-theme coherence = X.
  Level 2 (corpus-level): data coverage = Y%, theme balance = Z.

  Theme review metrics are loaded in the Review Table below. Please
  review, edit if needed, and click **Submit Review**. Then click
  **Proceed to defining themes** to begin Phase 5."

  STOP GATE 3 (Theme Review Adequacy):
    SG3-A: Level 2 coverage < 60%
    SG3-B: any single theme covers > 60% of data items
    SG3-C: Level 1 coherence < 0.30
    SG3-D: fewer than 3 themes survived review
  [WAITING FOR REVIEW TABLE]. STOP.

PHASE 5 — DEFINING AND NAMING THEMES
  "Ongoing analysis to refine the specifics of each theme, and the
  overall story the analysis tells, generating clear definitions and
  names for each theme." (B&C, 2006, p.87)

  Operationalisation: Call generate_theme_profiles to retrieve the
  top-5 representative data extracts per theme (nearest to centroid).
  NEVER recall extract text from memory — always present the EXACT
  extracts returned by the tool. Propose definitions based on these
  real extracts.

  Report format (USE EXACT WORDING):
  "Generated definitions and names for K themes based on the top-5
  most representative data extracts per theme.

  Theme definitions are loaded in the Review Table below. Please
  review, edit if needed, and click **Submit Review**. Then click
  **Proceed to producing the report** to begin Phase 6."

  [WAITING FOR REVIEW TABLE]. STOP.

PHASE 5.5 — TAXONOMY ALIGNMENT (extension to B&C)
  Call compare_with_taxonomy to map defined themes to the PAJAIS 25
  information-systems research categories (Jiang et al., 2019) for
  deductive validation.

  STOP GATE 4 (Taxonomy Alignment Quality):
    SG4-A: any theme maps to zero categories
    SG4-B: > 30% of alignment scores < 0.40
    SG4-C: single PAJAIS category covers > 50% of themes
    SG4-D: incomplete alignment
  [WAITING FOR REVIEW TABLE]. STOP.

PHASE 6 — PRODUCING THE REPORT
  "The final opportunity for analysis. Selection of vivid, compelling
  extract examples, final analysis of selected extracts, relating
  back of the analysis to the research question and literature,
  producing a scholarly report of the analysis." (B&C, 2006, p.87)

  Operationalisation: Call generate_comparison_csv (convergence/
  divergence summary). Present summary, stop for review.

  STOP GATE 5 (Comparison Review):
    Reviewer confirms convergence/divergence pattern is meaningful.
  [WAITING FOR REVIEW TABLE]. STOP.

  Then call export_narrative (scholarly 500-word narrative using
  selected vivid extracts).

  STOP GATE 6 (Scholarly Report Approval):
    Reviewer approves final written narrative.
  [WAITING FOR REVIEW TABLE]. STOP.
  DONE — all 6 STOP gates passed, analysis complete.

6 STOP GATES:
  STOP-1 (Phase 2)   : Initial Code Quality
  STOP-2 (Phase 3)   : Candidate Theme Coherence
  STOP-3 (Phase 4)   : Theme Review Adequacy
  STOP-4 (Phase 5.5) : Taxonomy Alignment Quality
  STOP-5 (Phase 6)   : Comparison Review
  STOP-6 (Phase 6)   : Scholarly Report Approval
"""
|
| 318 |
+
|
| 319 |
+
# Deterministic decoding (temperature=0) so labelling and narrative calls are
# reproducible; 8192 max tokens leaves room for long phase reports.
llm = ChatMistralAI(model="mistral-large-latest", temperature=0, max_tokens=8192)

# In-process checkpointer: conversation state persists across turns for the
# lifetime of the process, keyed by thread_id (see run() below).
memory = InMemorySaver()

# Tool-calling agent wired with all 10 thematic-analysis tools and the
# Braun & Clarke system prompt.
agent = create_agent(
    model=llm,
    tools=ALL_TOOLS,
    system_prompt=SYSTEM_PROMPT,
    checkpointer=memory,
)
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def run(user_message: str, thread_id: str = "default") -> str:
    """Invoke the agent for one conversation turn.

    Args:
        user_message: The user's message for this turn.
        thread_id: Checkpointer key so multi-turn state is kept per session.

    Returns:
        The text content of the agent's final message, or "" when the agent
        produced no messages.
    """
    config = {"configurable": {"thread_id": thread_id}}
    payload = {"messages": [{"role": "user", "content": user_message}]}
    result = agent.invoke(payload, config=config)
    msgs = result.get("messages", [])
    if not msgs:
        return ""
    content = msgs[-1].content
    # LangChain message content may be a list of content blocks; the original
    # `(msgs and msgs[-1].content) or ""` could leak a list despite the -> str
    # annotation. Flatten any text parts to honour the contract.
    if isinstance(content, list):
        return "".join(
            part.get("text", "") if isinstance(part, dict) else str(part)
            for part in content
        )
    return content or ""
|
app.py
ADDED
|
@@ -0,0 +1,548 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app.py β Braun & Clarke (2006) Thematic Analysis Agent UI.
|
| 3 |
+
|
| 4 |
+
Implements the 6-phase reflexive thematic analysis procedure from
|
| 5 |
+
Braun, V., & Clarke, V. (2006). Using thematic analysis in psychology.
|
| 6 |
+
Qualitative Research in Psychology, 3(2), 77-101.
|
| 7 |
+
|
| 8 |
+
Three UX features:
|
| 9 |
+
1. Phase banner β large prominent display of current B&C phase
|
| 10 |
+
2. Dynamic phase actions β only actions valid for current phase shown
|
| 11 |
+
3. Auto-populated review table β loads from tool checkpoint files
|
| 12 |
+
|
| 13 |
+
9-column review table: #, Code/Theme Label, Data Extract, Extracts,
|
| 14 |
+
Data Items, Approve, Rename To, Move To, Analytic Memo.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import gradio as gr
|
| 18 |
+
import pandas as pd
|
| 19 |
+
import json
|
| 20 |
+
import os
|
| 21 |
+
import re
|
| 22 |
+
import tempfile
|
| 23 |
+
from datetime import datetime
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
from agent import run as agent_run
|
| 26 |
+
|
| 27 |
+
THREAD_ID = f"thematic-analysis-{datetime.now().strftime('%Y%m%d%H%M%S')}"
|
| 28 |
+
|
| 29 |
+
REVIEW_COLS = [
|
| 30 |
+
"#", "Code / Theme Label", "Data Extract", "Extracts", "Data Items",
|
| 31 |
+
"Approve", "Rename To", "Move To", "Analytic Memo",
|
| 32 |
+
]
|
| 33 |
+
|
| 34 |
+
EMPTY_TABLE = pd.DataFrame(
|
| 35 |
+
{"#": ["-"], "Code / Theme Label": ["No codes yet β run analysis first"],
|
| 36 |
+
"Data Extract": [""], "Extracts": [""], "Data Items": [""],
|
| 37 |
+
"Approve": [""], "Rename To": [""], "Move To": [""], "Analytic Memo": [""]},
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
# Phase number -> (banner title, progress bar, next-step instruction).
# NOTE(review): progress-bar glyphs restored from mojibake; source showed
# "π¦"/"β¬" which decode as blue/white squares — confirm original emoji.
PHASE_INFO = {
    0: ("Getting started", "⬜⬜⬜⬜⬜⬜",
        "Upload your Scopus CSV data set, then click **Analyse my data set**"),
    1: ("Phase 1 — Familiarisation with the Data", "🟦⬜⬜⬜⬜⬜",
        "Click **Run analysis on abstracts** or **Run analysis on titles** "
        "to begin familiarisation with the data corpus"),
    2: ("Phase 2 — Generating Initial Codes", "🟦🟦⬜⬜⬜⬜",
        "Review initial codes in the table below. Edit Approve / Rename / "
        "Move extracts, then click **Submit Review** to collate codes into themes"),
    3: ("Phase 3 — Searching for Themes", "🟦🟦🟦⬜⬜⬜",
        "Review candidate themes (collated initial codes). Edit the table "
        "and click **Submit Review** to proceed to theme review"),
    4: ("Phase 4 — Reviewing Themes", "🟦🟦🟦🟦⬜⬜",
        "Review themes against coded extracts (Level 1) and the entire "
        "data set (Level 2). Click **Submit Review** to confirm"),
    5: ("Phase 5 — Defining and Naming Themes", "🟦🟦🟦🟦🟦⬜",
        "Review theme definitions and names. Edit and click **Submit Review**"),
    6: ("Phase 6 — Producing the Report", "🟦🟦🟦🟦🟦🟦",
        "Review the scholarly report and thematic map. "
        "**Submit Review** to finalise"),
}

# Phase number -> quick-prompt button labels offered for that phase.
PHASE_PROMPTS = {
    0: ["Analyse my data set"],
    1: ["Run analysis on abstracts", "Run analysis on titles",
        "Show data corpus statistics"],
    2: ["Proceed to searching for themes", "Show initial codes",
        "How many orphan extracts?"],
    3: ["Proceed to reviewing themes", "Show candidate themes",
        "Explain theme collation"],
    4: ["Proceed to defining themes", "Show thematic map"],
    5: ["Proceed to producing the report", "Show theme definitions",
        "Compare themes with PAJAIS taxonomy"],
    6: ["Produce final scholarly report", "Show comparison table",
        "Export all results"],
}
|
| 76 |
+
|
| 77 |
+
# Markdown shown in the "Methodology References" tab. Reconstructed from a
# mojibake-damaged source; section/link emoji are best-effort restorations
# (source showed "π"-style bytes) — TODO confirm original glyphs.
REFERENCES_MD = """
## Methodology References

Click any link to open the paper in a new tab. These are the foundational
papers you can cite in your methodology section.

---

### 📚 Thematic Analysis (the method)

**Braun, V., & Clarke, V. (2006).** Using thematic analysis in psychology.
*Qualitative Research in Psychology*, 3(2), 77–101.
🔗 [DOI: 10.1191/1478088706qp063oa](https://doi.org/10.1191/1478088706qp063oa)

> The foundational paper defining the six-phase reflexive thematic
> analysis procedure. Cite this as the primary methodology reference.
> Every phase name, terminology, and review step in this agent maps
> directly to the procedures on pp. 87–93.

**Braun, V., & Clarke, V. (2019).** Reflecting on reflexive thematic analysis.
*Qualitative Research in Sport, Exercise and Health*, 11(4), 589–597.
🔗 [DOI: 10.1080/2159676X.2019.1628806](https://doi.org/10.1080/2159676X.2019.1628806)

> A later clarification emphasising the reflexive, recursive, and
> researcher-in-the-loop nature of the method. Useful for defending
> the human-approval design of this agent.

**Braun, V., & Clarke, V. (2021).** One size fits all? What counts as
quality practice in (reflexive) thematic analysis? *Qualitative Research
in Psychology*, 18(3), 328–352.
🔗 [DOI: 10.1080/14780887.2020.1769238](https://doi.org/10.1080/14780887.2020.1769238)

> Quality criteria for thematic analysis — useful for defending the
> STOP gate design as reviewer-approval checkpoints.

---

### 🧠 Embedding Model (Sentence-BERT)

**Reimers, N., & Gurevych, I. (2019).** Sentence-BERT: Sentence Embeddings
using Siamese BERT-Networks. *Proceedings of EMNLP-IJCNLP 2019*.
🔗 [arXiv: 1908.10084](https://arxiv.org/abs/1908.10084)

> The paper behind `sentence-transformers/all-MiniLM-L6-v2`, the embedding
> model used to convert data extracts into 384-dimensional vectors.
> Establishes cosine similarity as the canonical comparison metric for
> SBERT embeddings — justifies our use of cosine distance.

---

### 🔬 Topic Modelling Framework (BERTopic)

**Grootendorst, M. (2022).** BERTopic: Neural topic modeling with a
class-based TF-IDF procedure. *arXiv preprint*.
🔗 [arXiv: 2203.05794](https://arxiv.org/abs/2203.05794)

> The BERTopic framework. Our approach follows its documented
> Agglomerative Clustering configuration with `distance_threshold=0.5`
> as a substitute for HDBSCAN when fine-grained control over code
> granularity is required.

---

### ⚙️ Clustering Algorithm (scikit-learn)

**Pedregosa, F., et al. (2011).** Scikit-learn: Machine Learning in Python.
*Journal of Machine Learning Research*, 12, 2825–2830.
🔗 [JMLR](https://jmlr.org/papers/v12/pedregosa11a.html)

> Cite this for `sklearn.cluster.AgglomerativeClustering` with
> `metric='cosine'`, `linkage='average'`, `distance_threshold=0.50`.

**Müllner, D. (2011).** Modern hierarchical, agglomerative clustering
algorithms. *arXiv preprint*.
🔗 [arXiv: 1109.2378](https://arxiv.org/abs/1109.2378)

> Comprehensive reference for agglomerative clustering algorithms and
> linkage methods — useful for justifying the choice of `average`
> linkage over `ward` for cosine-distance data.

---

### 🤖 Language Model (Mistral)

**Jiang, A. Q., et al. (2023).** Mistral 7B. *arXiv preprint*.
🔗 [arXiv: 2310.06825](https://arxiv.org/abs/2310.06825)

> The family of LLMs used for initial code labelling and narrative
> generation. Our agent uses `mistral-large-latest` for these
> LLM-dependent tool calls.

---

### 🔗 LangChain / LangGraph

**Chase, H., et al. (2023).** LangChain. *GitHub repository*.
🔗 [github.com/langchain-ai/langchain](https://github.com/langchain-ai/langchain)

**Chase, H., et al. (2024).** LangGraph. *GitHub repository*.
🔗 [github.com/langchain-ai/langgraph](https://github.com/langchain-ai/langgraph)

> The agent orchestration framework. `create_agent` (LangChain v1)
> with `InMemorySaver` (LangGraph) provides the stateful multi-turn
> conversation with tool-use capability underlying this agent.

---

### 🎨 User Interface (Gradio)

**Abid, A., et al. (2019).** Gradio: Hassle-free sharing and testing of
ML models in the wild. *arXiv preprint*.
🔗 [arXiv: 1906.02569](https://arxiv.org/abs/1906.02569)

> The web UI framework. This application uses Gradio 6.x components:
> `gr.Blocks`, `gr.Chatbot`, `gr.Dataframe`, `gr.File`, etc.

---

## How to cite this agent in your report

> "Thematic analysis was conducted following Braun and Clarke's (2006)
> six-phase reflexive procedure, computationally assisted using a
> researcher-in-the-loop agent. Data extracts were embedded using
> `all-MiniLM-L6-v2` (Reimers & Gurevych, 2019), clustered with
> `sklearn.cluster.AgglomerativeClustering` (Pedregosa et al., 2011)
> using `metric='cosine'`, `linkage='average'`, and
> `distance_threshold=0.50`, following the Agglomerative Clustering
> configuration documented in the BERTopic framework (Grootendorst, 2022).
> Initial code labels and the final scholarly narrative were generated
> using `mistral-large-latest` (Jiang et al., 2023). At every phase
> boundary, the researcher reviewed and approved computational outputs
> via a structured review table before the analysis advanced, preserving
> the reflexive, recursive, and analyst-led character of thematic
> analysis (Braun & Clarke, 2019; 2021)."
"""
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def _prompt_button_updates(phase: int) -> tuple:
    """Return gr.update values for the 4 phase-specific prompt buttons.

    Only prompts relevant to the given phase are shown; buttons without a
    prompt are hidden (visible=False) so the UI stays clean.

    Args:
        phase: Current Braun & Clarke phase number (0 = not started).

    Returns:
        Tuple of 4 gr.update objects for btn1, btn2, btn3, btn4.
    """
    # Pad with empty labels so there are always exactly 4 entries.
    padded = PHASE_PROMPTS.get(phase, PHASE_PROMPTS[0]) + [""] * 4
    labels = padded[:4]
    # An empty label hides its button; a non-empty one shows and relabels it.
    return tuple(
        map(lambda text: gr.update(value=text, visible=bool(text)), labels)
    )
|
| 228 |
+
|
| 229 |
+
_path = lambda file: str(
|
| 230 |
+
(hasattr(file, "name") and file.name)
|
| 231 |
+
or (isinstance(file, str) and file)
|
| 232 |
+
or ""
|
| 233 |
+
)
|
| 234 |
+
_name = lambda file: os.path.basename(_path(file))
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def _extract_phase(text: str) -> int:
|
| 238 |
+
"""Extract phase number from agent response. Returns 0 if not found."""
|
| 239 |
+
found = re.findall(r"Phase (\d)", str(text))
|
| 240 |
+
return int((found or ["0"])[0])
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def _phase_banner(num: int) -> str:
    """Build the prominent markdown banner for a phase.

    Args:
        num: Phase number; unknown numbers fall back to PHASE_INFO[0].

    Returns:
        Markdown string with progress heading and next-step instruction.
    """
    name, progress, instruction = PHASE_INFO.get(num, PHASE_INFO[0])
    heading = f"## {progress} {name}"
    next_step = f"**NEXT STEP β** {instruction}"
    return heading + "\n\n" + next_step
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def _load_review_table(base_dir: str) -> pd.DataFrame:
    """Load latest checkpoint file into the 9-column review table.

    Scans base_dir for topic_labels.json, themes.json, taxonomy_alignment.json,
    summaries.json. Loads the most recently modified one and formats it.
    Returns EMPTY_TABLE if nothing found.
    """
    # Placeholder path keeps Path() construction valid when base_dir is "".
    base = Path(str(base_dir or "/tmp/nonexistent_dir_placeholder"))
    # and/or chain (project style avoids if-statements): a falsy base_dir or
    # a missing directory short-circuits to [] via the trailing `or []`.
    # glob() with a literal filename matches at most that single file.
    candidates = (
        base_dir and base.exists() and sorted(
            (
                list(base.glob("topic_labels.json"))
                + list(base.glob("themes.json"))
                + list(base.glob("taxonomy_alignment.json"))
                + list(base.glob("summaries.json"))
            ),
            key=lambda p: p.stat().st_mtime,  # newest checkpoint first
            reverse=True,
        )
    ) or []

    # candidates[:1] is [] or [newest]; `or [None]` avoids an IndexError.
    latest = (candidates[:1] or [None])[0]
    # Single-element-list trick selects between the two branches without if.
    return (latest and [_format_checkpoint(latest)] or [EMPTY_TABLE.copy()])[0]
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def _format_checkpoint(path) -> pd.DataFrame:
    """Format a checkpoint JSON file into review table rows.

    Merges data from multiple checkpoint files when available:
    topic_labels.json has labels but no sizes β summaries.json has sizes.
    """
    raw = json.loads(Path(path).read_text())
    base = Path(path).parent

    # Accept either a dict wrapper ({"clusters": [...]} or {"per_theme": [...]})
    # or a bare list; anything else degrades to an empty list.
    data = (isinstance(raw, dict) and raw.get("clusters", raw.get("per_theme", []))) or \
           (isinstance(raw, list) and raw) or []

    # Index summaries.json (if present) by topic_id so each row can pull
    # size/representative data missing from the primary checkpoint.
    summaries_data = {}
    summaries_path = base / "summaries.json"
    summaries_raw = (
        summaries_path.exists() and json.loads(summaries_path.read_text()) or {}
    )
    summaries_list = (
        isinstance(summaries_raw, dict) and summaries_raw.get("clusters", [])
    ) or (isinstance(summaries_raw, list) and summaries_raw) or []
    # map-for-side-effect (project style avoids for-loops); -999 is a
    # sentinel key for entries lacking a topic_id.
    list(map(
        lambda s: summaries_data.update({s.get("topic_id", -999): s}),
        summaries_list,
    ))

    def _row(item: dict) -> dict:
        """Map one JSON item to review table columns, merging summaries data."""
        tid = item.get("topic_id", item.get("theme_id", 0))
        summary = summaries_data.get(tid, {})
        return {
            "#": tid,
            "Code / Theme Label": item.get("label", item.get("theme_label", "")),
            # First non-empty of: item representative, summary representative,
            # item notes β truncated to 150 chars for display.
            "Data Extract": str(
                item.get("representative", "")
                or summary.get("representative", "")
                or item.get("notes", "")
            )[:150],
            "Extracts": item.get("size", 0) or summary.get("size", 0)
                or item.get("total_papers", 0),
            "Data Items": item.get("size", 0) or summary.get("size", 0)
                or item.get("total_papers", 0),
            # Reviewer-editable columns default to approved/blank.
            "Approve": "Yes",
            "Rename To": "",
            "Move To": "",
            "Analytic Memo": str(item.get("rationale",
                                          item.get("notes", ""))),
        }

    # Cap at 200 rows to keep the UI table responsive.
    rows = list(map(_row, data[:200]))
    return (rows and [pd.DataFrame(rows, columns=REVIEW_COLS)] or [EMPTY_TABLE.copy()])[0]
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def on_file_upload(file):
    """Extract CSV stats and return updates for info, state, banner, buttons."""
    path = _path(file)
    # UI state used when the file picker is cleared (no path available).
    no_file = (
        "Upload a CSV to begin.",
        "",
        _phase_banner(0),
        *_prompt_button_updates(0),
    )
    return (not path) and no_file or _do_file_upload(path, file)
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
def _do_file_upload(path: str, file) -> tuple:
    """Process a validated upload: read the CSV and build post-upload UI state.

    Args:
        path: Filesystem path of the uploaded CSV (already validated).
        file: Original Gradio file value (used only for its display name).

    Returns:
        Tuple: (info markdown, base directory, phase-1 banner, 4 button updates).
    """
    df = pd.read_csv(path)
    rows, cols = df.shape
    base = str(Path(path).parent)
    segments = [
        f"**Loaded:** `{_name(file)}`",
        f"**Shape:** {rows:,} rows x {cols} columns",
        f"**Columns:** {', '.join(df.columns[:6].tolist())}",
        "*Click a prompt below and press Send to begin.*",
    ]
    info = "\n\n".join(segments)
    return (info, base, _phase_banner(1), *_prompt_button_updates(1))
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
def on_send(user_msg, history, file, base_dir):
    """Pass user message to agent. Update banner, table, and prompt buttons.

    Generator with two yields: the first echoes the user message plus a
    "Thinking..." placeholder immediately (gr.skip() leaves the remaining
    outputs untouched); the second swaps in the real agent reply and
    refreshes banner, review table, and phase buttons.
    """
    # Blank input falls back to "help" so the agent always gets a message.
    msg = (user_msg or "").strip() or "help"
    # bool(file) is 0 or 1: the CSV tag is prepended only when a file exists.
    csv_tag = f"[CSV: {_path(file)}]\n" * bool(file)

    history = list(history or [])
    history.append({"role": "user", "content": msg})
    history.append({"role": "assistant", "content": "Thinking..."})
    # First yield: instant UI feedback before the (slow) agent call.
    yield (
        history, "", gr.skip(), gr.skip(), gr.skip(),
        gr.skip(), gr.skip(), gr.skip(), gr.skip(),
    )

    reply = agent_run(csv_tag + msg, thread_id=THREAD_ID)
    history[-1] = {"role": "assistant", "content": reply}  # replace placeholder

    # Derive the new UI state from the reply text and on-disk checkpoints.
    phase = _extract_phase(reply)
    banner = _phase_banner(phase)
    table = _load_review_table(base_dir)
    btn_updates = _prompt_button_updates(phase)

    yield (history, "", banner, table, base_dir, *btn_updates)
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
def on_submit_review(table_df, history, base_dir):
    """Serialise review table edits to agent. Return updated UI.

    Args:
        table_df: Edited 9-column review DataFrame from gr.Dataframe.
        history: Current chat history (list of role/content dicts).
        base_dir: Directory holding checkpoint JSONs for the table reload.

    Returns:
        Tuple: (history, phase banner markdown, refreshed review table,
        4 prompt-button updates).
    """
    history = list(history or [])
    # Full table as JSON records so the agent sees every edited cell.
    edits = table_df.to_json(orient="records", indent=2)

    history.append({"role": "user", "content": "[REVIEW SUBMITTED]"})
    history.append({"role": "assistant", "content": "Processing review..."})

    reply = agent_run(
        "Reviewer submitted table edits.\n\n"
        f"```json\n{edits}\n```\n\n"
        "Process: Approve/Reject decisions, Rename To values, "
        "Move To reassignments (call reassign_sentences if moves exist), "
        "Reasoning notes. Then check STOP gates and proceed.",
        thread_id=THREAD_ID,
    )
    history[-1] = {"role": "assistant", "content": reply}  # replace placeholder

    phase = _extract_phase(reply)
    return (
        history, _phase_banner(phase), _load_review_table(base_dir),
        *_prompt_button_updates(phase),
    )
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
def on_download(table_df, history):
    """Export the review table as CSV and the chat transcript as TXT.

    Args:
        table_df: Review table DataFrame to export.
        history: Chat history (list of role/content dicts) or None.

    Returns:
        List of two temp-file paths: [review CSV path, chat TXT path].
    """
    csv_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", prefix="review_")
    # Close the handle before pandas reopens the file by name: fixes the
    # leaked file descriptor and avoids sharing errors on Windows.
    csv_tmp.close()
    table_df.to_csv(csv_tmp.name, index=False)

    txt_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".txt", prefix="chat_")
    transcript = "\n\n".join(
        list(map(
            lambda m: f"{m.get('role', '').upper()}: {m.get('content', '')}",
            history or [],
        ))
    )
    txt_tmp.write(transcript.encode("utf-8"))
    txt_tmp.close()
    return [csv_tmp.name, txt_tmp.name]
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
# ---------------------------------------------------------------------------
# Gradio UI: one Analysis tab (upload β chat β review table β downloads)
# plus a References tab. Event wiring at the bottom routes every input path
# (Send button, Enter key, and the 4 phase buttons) through on_send.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Thematic Analysis Agent") as demo:

    # Directory of the uploaded CSV; checkpoint JSONs are read from here to
    # refresh the review table after each agent turn.
    base_dir_state = gr.State(value="")

    gr.Markdown("# Thematic Analysis Agent")
    gr.Markdown(
        "**Braun & Clarke (2006) 6-Phase Reflexive Thematic Analysis** "
        "| Sentence-BERT Embeddings | Agglomerative Clustering | "
        "Cosine Distance 0.50"
    )

    # Prominent phase/progress banner, updated after every agent reply.
    phase_banner = gr.Markdown(value=_phase_banner(0))

    with gr.Tabs():

        with gr.Tab("π¬ Analysis"):
            # --- Section 1: CSV upload ------------------------------------
            gr.Markdown("---\n### Section 1 β Data Corpus")
            with gr.Row():
                with gr.Column(scale=3):
                    file_input = gr.File(
                        label="Upload data corpus (Scopus CSV)",
                        file_types=[".csv"],
                        file_count="single",
                    )
                with gr.Column(scale=5):
                    file_info = gr.Markdown("Upload a CSV to begin.")

            # --- Section 2: analyst dialogue ------------------------------
            gr.Markdown("---\n### Section 2 β Analyst Dialogue")
            chatbot = gr.Chatbot(label="Thematic Analysis Agent", height=200)
            with gr.Row():
                msg_box = gr.Textbox(
                    placeholder="Type a message or click a phase action below",
                    show_label=False, scale=7, lines=1,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

            # Four phase-action buttons; labels/visibility are driven by
            # _prompt_button_updates(phase) after each agent reply.
            gr.Markdown("**Phase actions** (click to proceed β only actions "
                        "valid for the current B&C phase are shown)")
            with gr.Row():
                prompt_btn_1 = gr.Button("Analyse my data set",
                                         variant="secondary", scale=1, size="sm")
                prompt_btn_2 = gr.Button("", variant="secondary", scale=1,
                                         size="sm", visible=False)
                prompt_btn_3 = gr.Button("", variant="secondary", scale=1,
                                         size="sm", visible=False)
                prompt_btn_4 = gr.Button("", variant="secondary", scale=1,
                                         size="sm", visible=False)

            # --- Section 3: editable review table -------------------------
            gr.Markdown("---\n### Section 3 β Initial Codes / Candidate Themes / Themes")
            gr.Markdown(
                "Auto-populated from tool outputs. Labels are **initial codes** "
                "in Phase 2, **candidate themes** in Phase 3, and **themes** in "
                "Phases 4β6. Edit **Approve**, **Rename To**, **Move To**, "
                "**Analytic Memo** columns, then click **Submit Review**."
            )
            review_table = gr.Dataframe(
                value=EMPTY_TABLE,
                headers=REVIEW_COLS,
                datatype=["number", "str", "str", "number", "number",
                          "str", "str", "str", "str"],
                column_count=(9, "fixed"),
                interactive=True,
                wrap=True,
                max_height=400,
            )
            with gr.Row():
                clear_btn = gr.Button("Clear table", variant="secondary", scale=2)
                sub_btn = gr.Button("Submit Review", variant="primary", scale=4)

            with gr.Accordion("Download", open=False):
                dl_btn = gr.Button("Generate downloads", variant="primary")
                dl_files = gr.File(label="Downloads", file_count="multiple",
                                   interactive=False)

        with gr.Tab("π References"):
            gr.Markdown(REFERENCES_MD)

    # --- Event wiring -----------------------------------------------------
    file_input.change(
        on_file_upload,
        inputs=[file_input],
        outputs=[file_info, base_dir_state, phase_banner,
                 prompt_btn_1, prompt_btn_2, prompt_btn_3, prompt_btn_4],
    )
    send_btn.click(
        on_send,
        inputs=[msg_box, chatbot, file_input, base_dir_state],
        outputs=[chatbot, msg_box, phase_banner, review_table, base_dir_state,
                 prompt_btn_1, prompt_btn_2, prompt_btn_3, prompt_btn_4],
    )
    msg_box.submit(
        on_send,
        inputs=[msg_box, chatbot, file_input, base_dir_state],
        outputs=[chatbot, msg_box, phase_banner, review_table, base_dir_state,
                 prompt_btn_1, prompt_btn_2, prompt_btn_3, prompt_btn_4],
    )
    # Each phase button passes ITS OWN label as the user message: note the
    # button component itself is the first element of `inputs`.
    prompt_btn_1.click(
        on_send,
        inputs=[prompt_btn_1, chatbot, file_input, base_dir_state],
        outputs=[chatbot, msg_box, phase_banner, review_table, base_dir_state,
                 prompt_btn_1, prompt_btn_2, prompt_btn_3, prompt_btn_4],
    )
    prompt_btn_2.click(
        on_send,
        inputs=[prompt_btn_2, chatbot, file_input, base_dir_state],
        outputs=[chatbot, msg_box, phase_banner, review_table, base_dir_state,
                 prompt_btn_1, prompt_btn_2, prompt_btn_3, prompt_btn_4],
    )
    prompt_btn_3.click(
        on_send,
        inputs=[prompt_btn_3, chatbot, file_input, base_dir_state],
        outputs=[chatbot, msg_box, phase_banner, review_table, base_dir_state,
                 prompt_btn_1, prompt_btn_2, prompt_btn_3, prompt_btn_4],
    )
    prompt_btn_4.click(
        on_send,
        inputs=[prompt_btn_4, chatbot, file_input, base_dir_state],
        outputs=[chatbot, msg_box, phase_banner, review_table, base_dir_state,
                 prompt_btn_1, prompt_btn_2, prompt_btn_3, prompt_btn_4],
    )
    clear_btn.click(lambda: EMPTY_TABLE.copy(), outputs=[review_table])
    sub_btn.click(
        on_submit_review,
        inputs=[review_table, chatbot, base_dir_state],
        outputs=[chatbot, phase_banner, review_table,
                 prompt_btn_1, prompt_btn_2, prompt_btn_3, prompt_btn_4],
    )
    dl_btn.click(on_download, inputs=[review_table, chatbot], outputs=[dl_files])

# ssr_mode=False: required for Chatbot/Dataframe interactivity in this setup.
demo.launch(ssr_mode=False, theme=gr.themes.Soft())
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=6.0.0
|
| 2 |
+
langchain>=1.0.0
|
| 3 |
+
langchain-mistralai>=1.0.0
|
| 4 |
+
langgraph>=1.0.0
|
| 5 |
+
sentence-transformers>=3.0.0
|
| 6 |
+
scikit-learn>=1.4.0
|
| 7 |
+
numpy>=1.26.0
|
| 8 |
+
pandas>=2.1.0
|
| 9 |
+
plotly>=5.20.0
|
| 10 |
+
pyarrow>=15.0.0
|
tools.py
ADDED
|
@@ -0,0 +1,1031 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
tools.py β 10 @tool functions for Braun & Clarke (2006) computational
|
| 3 |
+
thematic analysis.
|
| 4 |
+
|
| 5 |
+
Pipeline (called in this order by the LLM agent):
|
| 6 |
+
|
| 7 |
+
1. load_scopus_csv β ingest CSV, strip boilerplate, save .parquet
|
| 8 |
+
2. run_bertopic_discovery β embed β cosine agglomerative cluster (min
   MIN_CLUSTER_SIZE members) β centroids β orphan report β 4 charts
|
| 10 |
+
3. label_topics_with_llm β Mistral labels top 100 clusters
|
| 11 |
+
4. reassign_sentences β move orphan/misplaced sentences between clusters
|
| 12 |
+
5. consolidate_into_themes β merge reviewer-approved groups
|
| 13 |
+
6. compute_saturation β coverage %, coherence, balance per theme
|
| 14 |
+
7. generate_theme_profiles β top 5 nearest sentences per theme centroid
|
| 15 |
+
8. compare_with_taxonomy β map themes to PAJAIS 25 categories
|
| 16 |
+
9. generate_comparison_csv β abstract vs title side-by-side
|
| 17 |
+
10. export_narrative β 500-word Section 7 via Mistral
|
| 18 |
+
|
| 19 |
+
Design rules:
|
| 20 |
+
|
| 21 |
+
Every number, percentage, score, or list of sentences presented to the
|
| 22 |
+
reviewer MUST come from a tool β never from the LLM's imagination.
|
| 23 |
+
|
| 24 |
+
Deterministic tools (1,2,4,5,6,7,9): same input β same output, every run.
|
| 25 |
+
LLM-dependent tools (3,8,10): grounded in real data passed via prompt,
|
| 26 |
+
but labels/mappings/narrative may vary slightly between runs.
|
| 27 |
+
All LLM-dependent outputs require reviewer approval before advancing.
|
| 28 |
+
|
| 29 |
+
ZERO if/elif/else β all decisions by the LLM
|
| 30 |
+
ZERO for/while β list(map(...)) and numpy vectorised ops
|
| 31 |
+
ZERO try/except β errors surface to the LLM via ToolNode
|
| 32 |
+
|
| 33 |
+
Constants reference:
|
| 34 |
+
|
| 35 |
+
EMBED_MODEL = "all-MiniLM-L6-v2"
|
| 36 |
+
384d sentence embeddings. Runs locally, no API calls.
|
| 37 |
+
normalize_embeddings=True β cosine similarity = dot product.
|
| 38 |
+
|
| 39 |
+
CLUSTER_THRESHOLD = 0.50
|
| 40 |
+
Cosine distance threshold for Agglomerative Clustering.
|
| 41 |
+
Two sentences must have cosine similarity >= 0.50 to share a code.
|
| 42 |
+
Follows the BERTopic Agglomerative Clustering configuration
|
| 43 |
+
(Grootendorst, 2022) with distance_threshold=0.5 as documented
|
| 44 |
+
in the BERTopic framework. Operationalises Braun & Clarke (2006)
|
| 45 |
+
Phase 2 'Generating Initial Codes' as a reproducible computation.
|
| 46 |
+
|
| 47 |
+
Tighter (e.g. 0.40) β more, finer codes (closer to B&C ideal)
|
| 48 |
+
Looser (e.g. 0.60) β fewer, broader codes
|
| 49 |
+
At 0.50 β balanced granularity following BERTopic docs example.
|
| 50 |
+
|
| 51 |
+
MIN_CLUSTER_SIZE = 5
    Clusters with fewer than MIN_CLUSTER_SIZE members are dissolved.
    Their sentences become orphans (label=-1) reported to the reviewer
    for reassignment.
|
| 54 |
+
|
| 55 |
+
N_CENTROIDS = 200
|
| 56 |
+
Maximum number of clusters saved to summaries.json (and therefore
|
| 57 |
+
labelled and shown in the review table). Set high enough to capture
|
| 58 |
+
all clusters in typical Scopus datasets (1k-5k papers).
|
| 59 |
+
Top clusters extracted for initial discovery report and charts.
|
| 60 |
+
|
| 61 |
+
TOP_TOPICS_LLM = 100
|
| 62 |
+
Maximum clusters sent to Mistral for labelling.
|
| 63 |
+
|
| 64 |
+
NARRATIVE_WORDS = 500
|
| 65 |
+
Target word count for Section 7 narrative.
|
| 66 |
+
|
| 67 |
+
PAJAIS_25
|
| 68 |
+
25 IS research categories from Jiang et al. (2019).
|
| 69 |
+
Used in Phase 5.5 for taxonomy alignment.
|
| 70 |
+
|
| 71 |
+
BOILERPLATE_PATTERNS (9 regexes)
|
| 72 |
+
Strip publisher noise: copyright, DOI, Elsevier, Springer,
|
| 73 |
+
IEEE, Wiley, Taylor & Francis.
|
| 74 |
+
"""
|
| 75 |
+
|
| 76 |
+
from __future__ import annotations
|
| 77 |
+
|
| 78 |
+
import json
|
| 79 |
+
import re
|
| 80 |
+
import numpy as np
|
| 81 |
+
import pandas as pd
|
| 82 |
+
import plotly.graph_objects as go
|
| 83 |
+
|
| 84 |
+
from pathlib import Path
|
| 85 |
+
from langchain_core.tools import tool
|
| 86 |
+
from langchain_mistralai import ChatMistralAI
|
| 87 |
+
from langchain_core.prompts import PromptTemplate
|
| 88 |
+
from langchain_core.output_parsers import JsonOutputParser
|
| 89 |
+
from sentence_transformers import SentenceTransformer
|
| 90 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 91 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 92 |
+
from sklearn.preprocessing import normalize
|
| 93 |
+
from sklearn.decomposition import PCA
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# Which CSV column feeds each analysis run mode.
RUN_CONFIGS = {
    "abstract": ["Abstract"],
    "title": ["Title"],
}

# 25 IS research categories (Jiang et al., 2019) used for taxonomy alignment
# in Phase 5.5 (compare_with_taxonomy).
PAJAIS_25 = [
    "Accounting Information Systems",
    "Artificial Intelligence & Expert Systems",
    "Big Data & Analytics",
    "Business Intelligence & Decision Support",
    "Cloud Computing",
    "Cybersecurity & Privacy",
    "Database Management",
    "Digital Transformation",
    "E-Business & E-Commerce",
    "Enterprise Resource Planning",
    "Fintech & Digital Finance",
    "Geographic Information Systems",
    "Health Informatics",
    "Human-Computer Interaction",
    "Information Systems Development",
    "IT Governance & Management",
    "IT Strategy & Competitive Advantage",
    "Knowledge Management",
    "Machine Learning & Deep Learning",
    "Mobile Computing",
    "Natural Language Processing",
    "Recommender Systems",
    "Social Media & Web 2.0",
    "Supply Chain & Logistics IS",
    "Virtual Reality & Augmented Reality",
]

# Publisher-noise regexes stripped from abstracts/titles before embedding:
# copyright notices, DOI prefixes, and publisher tags.
BOILERPLATE_PATTERNS = [
    r"Β©\s*\d{4}",
    r"all rights reserved",
    r"published by elsevier",
    r"this article is protected",
    r"doi:\s*10\.\d{4,}",
    r"springer nature",
    r"ieee xplore",
    r"wiley online library",
    r"taylor & francis",
]

# Single case-insensitive union of all boilerplate patterns.
BOILERPLATE_RE = re.compile("|".join(BOILERPLATE_PATTERNS), flags=re.IGNORECASE)
# Split after terminal punctuation (., !, ?) followed by whitespace.
SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
EMBED_MODEL = "all-MiniLM-L6-v2"  # local 384-d sentence encoder (no API)
N_CENTROIDS = 200                 # max clusters saved to summaries.json
CLUSTER_THRESHOLD = 0.50          # cosine-distance merge cutoff for clustering
# NOTE(review): module docstring and pipeline comments describe this as 3,
# but the code value is 5 β confirm which is intended.
MIN_CLUSTER_SIZE = 5
TOP_TOPICS_LLM = 100              # max clusters sent to Mistral for labelling
NARRATIVE_WORDS = 500             # target word count for Section 7 narrative
|
| 150 |
+
|
| 151 |
+
def _clean_text(text: str) -> str:
    """Strip publisher boilerplate from one abstract/title string.

    Applies the 9-pattern BOILERPLATE_RE to remove copyright notices,
    DOI prefixes, and publisher tags that would pollute embeddings.

    Args:
        text: Raw abstract or title (coerced to str).

    Returns:
        The cleaned string with boilerplate removed and whitespace trimmed.
    """
    cleaned = BOILERPLATE_RE.sub("", str(text))
    return cleaned.strip()
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _sentence_count(text: str) -> int:
    """Count sentences by splitting on terminal punctuation + whitespace.

    Args:
        text: Cleaned abstract or title text.

    Returns:
        Number of regex-delimited segments (>= 1 for any input, since
        re.split always yields at least one element).
    """
    segments = SENTENCE_SPLIT_RE.split(text.strip())
    return len(segments)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def _embed(texts: list[str]) -> np.ndarray:
    """Encode texts as 384-d unit vectors with all-MiniLM-L6-v2 (local).

    Runs entirely on-device β no API calls. normalize_embeddings=True makes
    cosine similarity equal to a plain dot product downstream.

    Args:
        texts: N cleaned text strings.

    Returns:
        float32 array of shape (N, 384) with L2-normalized rows.
    """
    encoder = SentenceTransformer(EMBED_MODEL)
    vectors = encoder.encode(
        texts, show_progress_bar=False, normalize_embeddings=True
    )
    return np.array(vectors, dtype=np.float32)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def _cosine_cluster(matrix: np.ndarray, threshold: float, min_size: int) -> np.ndarray:
    """Cluster embeddings using agglomerative cosine clustering.

    Works DIRECTLY in 384d space β no UMAP. After clustering, any cluster
    with fewer than min_size members is dissolved: its sentences get
    label=-1 (orphan) and are reported to the reviewer for reassignment.

    Algorithm:
        1. Start: every text is its own cluster.
        2. Merge the two closest clusters (average cosine distance).
        3. Repeat until smallest distance exceeds threshold.
        4. Post-process: dissolve clusters smaller than min_size.

    Args:
        matrix: (N, 384) embedding matrix, L2-normalized.
        threshold: Max average cosine distance for merging (callers pass
            CLUSTER_THRESHOLD = 0.50).
        min_size: Minimum members per cluster (callers pass
            MIN_CLUSTER_SIZE); smaller clusters become orphans.

    Returns:
        np.ndarray shape (N,) with integer labels. -1 = orphan.
    """
    # Defensive re-normalization; a no-op when rows already come from _embed.
    normed = normalize(matrix, norm="l2")
    model = AgglomerativeClustering(
        n_clusters=None,  # grow until distance_threshold stops merging
        metric="cosine",
        linkage="average",
        distance_threshold=threshold,
    )
    labels = model.fit_predict(normed).astype(int)
    # Dissolve under-sized clusters: every member is relabelled orphan (-1).
    unique, counts = np.unique(labels, return_counts=True)
    small_clusters = unique[counts < min_size]
    return np.where(np.isin(labels, small_clusters), -1, labels)
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def _centroid(vecs: np.ndarray) -> np.ndarray:
    """Return the L2-normalized mean vector (average direction) of a cluster.

    Args:
        vecs: (M, 384) matrix of member embeddings for one cluster.

    Returns:
        1-d np.ndarray of shape (384,), unit length.
    """
    mean_vec = vecs.mean(axis=0, keepdims=True)
    return normalize(mean_vec, norm="l2")[0]
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def _top_n_centroids(matrix: np.ndarray, labels: np.ndarray, n: int) -> list[dict]:
    """Summarise the n largest clusters (descending size), excluding orphans.

    Args:
        matrix: (N, 384) full embedding matrix.
        labels: (N,) integer cluster labels (-1 = orphan, not ranked).
        n: How many top clusters to return.

    Returns:
        List of up to n dicts with keys: label, size, indices, centroid.
    """
    members = labels[labels >= 0]  # drop orphans before ranking
    cluster_ids, cluster_sizes = np.unique(members, return_counts=True)
    biggest_first = np.argsort(cluster_sizes)[::-1][:n]
    selected = cluster_ids[biggest_first]

    def _summarise(cluster_id: int) -> dict:
        """Build the summary dict for a single cluster id."""
        member_idx = np.where(labels == cluster_id)[0].tolist()
        return {
            "label": int(cluster_id),
            "size": len(member_idx),
            "indices": member_idx,
            "centroid": _centroid(matrix[member_idx]),
        }

    return list(map(_summarise, selected))
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def _mistral_chain(template_str: str):
    """Assemble a prompt -> Mistral -> JSON-parser LangChain pipeline.

    Args:
        template_str: Prompt template with {variable} placeholders.

    Returns:
        LangChain Runnable chain that accepts dict and returns parsed JSON.
    """
    prompt = PromptTemplate.from_template(template_str)
    # temperature=0 keeps labelling/alignment output reproducible.
    model = ChatMistralAI(model="mistral-large-latest", temperature=0)
    parser = JsonOutputParser()
    return prompt | model | parser
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
def _dark_layout(title: str) -> dict:
|
| 288 |
+
"""Return Plotly layout dict with dark theme styling.
|
| 289 |
+
|
| 290 |
+
Args:
|
| 291 |
+
title: Chart title string.
|
| 292 |
+
|
| 293 |
+
Returns:
|
| 294 |
+
Dict for fig.update_layout(**_dark_layout("...")).
|
| 295 |
+
"""
|
| 296 |
+
return dict(
|
| 297 |
+
title=title, paper_bgcolor="#0F172A", plot_bgcolor="#0F172A",
|
| 298 |
+
font=dict(color="#CBD5E1", family="Sora,sans-serif"),
|
| 299 |
+
margin=dict(t=50, b=40, l=40, r=20),
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
@tool
def load_scopus_csv(csv_path: str, run_mode: str = "abstract") -> str:
    """Load a Scopus CSV, count papers/sentences, apply boilerplate filter.

    Phase 1 – Familiarisation with the Data. DETERMINISTIC.

    Steps:
    1. Read CSV, drop rows where target column is null
    2. Apply 9-pattern boilerplate regex to clean each text
    3. Count sentences per paper
    4. Save cleaned DataFrame as .parquet

    Args:
        csv_path: Path to raw Scopus CSV.
        run_mode: 'abstract' or 'title'.

    Returns:
        JSON: total_papers, total_sentences, columns_used,
        boilerplate_removed, cleaned_parquet, run_mode.
    """
    cols = RUN_CONFIGS[run_mode]
    target = cols[0]

    # Rows with an empty target column carry no analysable text.
    df = pd.read_csv(csv_path).dropna(subset=[target]).reset_index(drop=True)
    raw_texts = df[target].tolist()
    cleaned_texts = [_clean_text(text) for text in raw_texts]

    # A paper counts as "boilerplate removed" when cleaning changed its text.
    boilerplate_removed = sum(
        1 for before, after in zip(raw_texts, cleaned_texts) if before != after
    )

    df[f"{target}_clean"] = cleaned_texts
    df["sentence_count"] = [_sentence_count(text) for text in cleaned_texts]

    out_path = Path(csv_path).with_suffix(".clean.parquet")
    df.to_parquet(out_path, index=False)

    stats = {
        "total_papers": len(df),
        "total_sentences": int(df["sentence_count"].sum()),
        "columns_used": cols,
        "boilerplate_removed": boilerplate_removed,
        "cleaned_parquet": str(out_path),
        "run_mode": run_mode,
    }
    return json.dumps(stats, indent=2)
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
@tool
def run_bertopic_discovery(parquet_path: str, run_mode: str = "abstract") -> str:
    """Embed texts, cluster them, report orphans, generate charts.

    Phase 2 – Generating Initial Codes. DETERMINISTIC.

    Steps:
    1. Load cleaned parquet, drop Author Keywords columns (RULE 8)
    2. Embed all texts → N x 384 matrix of unit vectors
    3. Save embedding matrix as .emb.npy
    4. Cluster in 384d space (NO UMAP), min 3 members per cluster
    5. Sentences in clusters < 3 members become orphans (label=-1)
    6. Extract top-N clusters by size, compute centroids
    7. Save summaries.json with clusters + orphan list
    8. Generate 4 Plotly HTML charts

    Args:
        parquet_path: Path to .clean.parquet from load_scopus_csv.
        run_mode: 'abstract' or 'title'.

    Returns:
        JSON: total_clusters, orphan_count, summaries_json, embeddings_npy,
        charts dict.
    """
    cols = RUN_CONFIGS[run_mode]
    target = f"{cols[0]}_clean"

    # FIX: read the parquet ONCE. The previous version called
    # pd.read_parquet a second time inside the drop() call just to
    # enumerate the columns, doubling the file I/O.
    df = pd.read_parquet(parquet_path)
    # RULE 8: author/keyword columns must not leak into the analysis.
    df = df.drop(
        columns=[c for c in df.columns if re.search(r"keyword|author", c, re.I)],
        errors="ignore",
    )

    paper_texts = df[target].tolist()

    # Split every paper into sentences; keep only sentences with >= 5 words.
    sentence_records = [
        record for record in (
            {"paper_idx": paper_i, "sent_idx": sent_i, "text": sent.strip()}
            for paper_i, paper_text in enumerate(paper_texts)
            for sent_i, sent in enumerate(SENTENCE_SPLIT_RE.split(paper_text or ""))
            if sent.strip()
        )
        if len(record["text"].split()) >= 5
    ]

    texts = [r["text"] for r in sentence_records]
    paper_idx = [r["paper_idx"] for r in sentence_records]
    embeddings = _embed(texts)
    base = Path(parquet_path).parent

    # FIX: compute the embeddings path once and reuse it in the return value
    # (it was previously rebuilt from scratch at the end of the function).
    emb_path = str(base / Path(parquet_path).stem) + ".emb.npy"
    np.save(emb_path, embeddings)

    labels = _cosine_cluster(embeddings, CLUSTER_THRESHOLD, MIN_CLUSTER_SIZE)
    orphan_idx = np.where(labels == -1)[0].tolist()
    orphan_count = len(orphan_idx)
    valid_count = int((labels >= 0).sum())
    n_clusters = int(np.unique(labels[labels >= 0]).shape[0])
    n_papers = len(set(paper_idx))
    n_sentences = len(texts)
    top_centroids = _top_n_centroids(embeddings, labels, N_CENTROIDS)

    def _topic_row(tc: dict) -> dict:
        """Convert centroid dict into summary row for summaries.json."""
        return {
            "topic_id": tc["label"],
            "size": tc["size"],
            "representative": texts[tc["indices"][0]][:200],
            "indices": tc["indices"],
        }

    summaries = [_topic_row(tc) for tc in top_centroids]
    orphans = [{"sentence_idx": int(i), "text": texts[i][:200]} for i in orphan_idx]

    output = {"clusters": summaries, "orphans": orphans}
    (base / "summaries.json").write_text(json.dumps(output, indent=2))

    # Chart 1: bar chart of the 20 largest clusters.
    unique, counts = np.unique(labels[labels >= 0], return_counts=True)
    order = np.argsort(counts)[::-1][:20]
    c1 = go.Figure(go.Bar(
        x=[str(u) for u in unique[order]], y=counts[order].tolist(),
        marker_color="#3B82F6", text=counts[order].tolist(), textposition="outside",
    ))
    c1.update_layout(**_dark_layout("Topic Size Distribution (Top 20)"),
                     xaxis=dict(showgrid=False),
                     yaxis=dict(showgrid=True, gridcolor="#1E293B"))
    c1.write_html(str(base / "chart_topic_sizes.html"))

    # Chart 2: cosine-similarity heatmap between the top centroids.
    centroid_matrix = np.vstack([tc["centroid"] for tc in top_centroids])
    sim_matrix = cosine_similarity(centroid_matrix)
    clabels = [f"T{tc['label']}" for tc in top_centroids]
    c2 = go.Figure(go.Heatmap(z=sim_matrix, x=clabels, y=clabels, colorscale="Blues"))
    c2.update_layout(**_dark_layout("Top-5 Centroid Cosine Similarity"))
    c2.write_html(str(base / "chart_centroid_heatmap.html"))

    # Chart 3: histogram of sentences per paper.
    sc = df.get("sentence_count", pd.Series([0] * len(df))).tolist()
    c3 = go.Figure(go.Histogram(x=sc, nbinsx=40, marker_color="#22D3EE"))
    c3.update_layout(**_dark_layout("Sentence Count Distribution"),
                     xaxis=dict(showgrid=False),
                     yaxis=dict(showgrid=True, gridcolor="#1E293B"))
    c3.write_html(str(base / "chart_sentence_distribution.html"))

    # Chart 4: 2-D PCA projection of the top centroids.
    coords = PCA(n_components=2).fit_transform(centroid_matrix)
    point_text = [f"T{tc['label']}({tc['size']})" for tc in top_centroids]
    c4 = go.Figure(go.Scatter(
        x=coords[:, 0].tolist(), y=coords[:, 1].tolist(),
        mode="markers+text", text=point_text, textposition="top center",
        marker=dict(size=12, color="#F59E0B", line=dict(width=1, color="#0F172A")),
    ))
    c4.update_layout(**_dark_layout("Top-5 Centroids – PCA Projection"))
    c4.write_html(str(base / "chart_centroid_pca.html"))

    return json.dumps({
        "total_clusters": n_clusters,
        "orphan_count": orphan_count,
        "valid_sentences": valid_count,
        "total_sentences": n_sentences,
        "total_papers": n_papers,
        "top_centroids": N_CENTROIDS,
        "summaries_json": str(base / "summaries.json"),
        "embeddings_npy": emb_path,
        "needs_review": True,
        "charts": {
            "topic_sizes": str(base / "chart_topic_sizes.html"),
            "centroid_heatmap": str(base / "chart_centroid_heatmap.html"),
            "sentence_dist": str(base / "chart_sentence_distribution.html"),
            "centroid_pca": str(base / "chart_centroid_pca.html"),
        },
    }, indent=2)
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
@tool
def label_topics_with_llm(summaries_json_path: str) -> str:
    """Send top-100 topic summaries to Mistral for labelling.

    Phase 2 – Naming Initial Codes. LLM-DEPENDENT (grounded in real data extracts).

    Steps:
    1. Load summaries.json clusters (not orphans)
    2. Take top 100 by size
    3. Mistral reads representative sentences → assigns labels
    4. Returns: topic_id, label, rationale, confidence per cluster
    5. Save as topic_labels.json

    Args:
        summaries_json_path: Path to summaries.json.

    Returns:
        JSON: labelled_topics count + output path. needs_review=True.
    """
    payload = json.loads(Path(summaries_json_path).read_text())
    # Fall back to the whole payload when there is no "clusters" key.
    summaries = payload.get("clusters", payload)[:TOP_TOPICS_LLM]

    template = (
        "You are a scientific topic labelling expert.\n\n"
        "Below are {n} topic summaries from a BERTopic analysis of academic papers.\n"
        "Each summary has: topic_id, size, representative text.\n\n"
        "{summaries}\n\n"
        "For EACH topic return a JSON array where every element has:\n"
        "  topic_id   : integer (copy from input)\n"
        "  label      : 2-5 word snake_case topic label\n"
        "  rationale  : one sentence justification\n"
        "  confidence : float 0.0-1.0\n\n"
        "Return ONLY the JSON array — no markdown, no preamble."
    )

    chain = _mistral_chain(template)
    labelled = chain.invoke({
        "n": len(summaries),
        "summaries": json.dumps(summaries, indent=2),
    })

    out_path = Path(summaries_json_path).parent / "topic_labels.json"
    out_path.write_text(json.dumps(labelled, indent=2))

    return json.dumps({
        "labelled_topics": len(labelled),
        "output": str(out_path),
        "needs_review": True,
    }, indent=2)
|
| 533 |
+
|
| 534 |
+
|
| 535 |
+
@tool
def reassign_sentences(
    summaries_json_path: str,
    embeddings_npy_path: str,
    move_instructions: list[dict],
) -> str:
    """Move orphan or misplaced sentences between clusters.

    Phase 2 – Reassigning orphan data extracts. DETERMINISTIC.

    The reviewer specifies moves as a list of dicts:
        [{"sentence_idx": 42, "to_cluster": 3},
         {"sentence_idx": 99, "to_cluster": "new"}]

    For "new" targets, a fresh cluster ID is assigned.
    After all moves, centroids are recomputed for affected clusters.

    Steps:
    1. Load summaries.json and embeddings
    2. Apply move instructions
    3. Update cluster assignments
    4. Recompute centroids for affected clusters
    5. Save updated summaries.json

    Args:
        summaries_json_path: Path to summaries.json.
        embeddings_npy_path: Path to .emb.npy.
        move_instructions: List of dicts with sentence_idx (int) and
            to_cluster (int or "new") keys.

    Returns:
        JSON: moves_applied count, orphans_remaining, updated summaries path.
    """
    data = json.loads(Path(summaries_json_path).read_text())
    embeddings = np.load(embeddings_npy_path)
    clusters = data.get("clusters", [])
    orphans = data.get("orphans", [])

    # sentence_idx -> topic_id for every sentence currently in a cluster.
    all_indices: dict[int, int] = {}
    for cluster in clusters:
        for idx in cluster.get("indices", []):
            all_indices[idx] = cluster["topic_id"]

    # Fresh IDs for "new" targets start just above the current maximum.
    max_id = max((c.get("topic_id", 0) for c in clusters), default=0)
    new_id_counter = [max_id + 1]

    def _apply_move(move: dict) -> dict:
        """Apply one move instruction and return the resolved assignment.

        BUG FIX: the previous boolean-arithmetic one-liners evaluated
        int("new") (ValueError) and int + "" (TypeError) whenever
        to_cluster was "new", so the documented "new cluster" feature
        always crashed. Plain branching makes the intent explicit.
        """
        s_idx = move["sentence_idx"]
        target = move["to_cluster"]
        if target == "new":
            assigned = new_id_counter[0]
            new_id_counter[0] += 1  # each "new" move gets its own fresh ID
        else:
            assigned = int(target)
        all_indices[s_idx] = assigned
        return {"sentence_idx": s_idx, "assigned_to": assigned}

    applied = [_apply_move(m) for m in move_instructions]

    unique_clusters = set(all_indices.values())

    def _rebuild_cluster(cid: int) -> dict:
        """Rebuild a cluster dict from the updated index map."""
        idx = [k for k, v in all_indices.items() if v == cid]
        # idx cannot be empty here (cid comes from all_indices values);
        # the `or [0]` fallback is kept for defensive parity.
        vecs = embeddings[idx or [0]]
        return {
            "topic_id": int(cid),
            "size": len(idx),
            "representative": "",
            "indices": idx,
            "centroid": _centroid(vecs).tolist(),
        }

    updated_clusters = [_rebuild_cluster(cid) for cid in sorted(unique_clusters)]
    remaining_orphan_idx = [o["sentence_idx"] for o in orphans
                            if o["sentence_idx"] not in all_indices]

    output = {
        "clusters": updated_clusters,
        "orphans": [{"sentence_idx": i, "text": ""} for i in remaining_orphan_idx],
    }
    Path(summaries_json_path).write_text(json.dumps(output, indent=2))

    return json.dumps({
        "moves_applied": len(applied),
        "orphans_remaining": len(remaining_orphan_idx),
        "summaries_json": summaries_json_path,
        "needs_review": True,
    }, indent=2)
|
| 627 |
+
|
| 628 |
+
|
| 629 |
+
@tool
def consolidate_into_themes(
    labels_json_path: str,
    embeddings_npy_path: str,
    approved_topic_ids: list[list[int]],
) -> str:
    """Merge approved topic groups into consolidated themes.

    Phase 3 – Searching for Themes. DETERMINISTIC.

    Steps:
    1. Load topic_labels.json and embedding matrix
    2. Pool all member embeddings per group
    3. Compute fresh L2-normalized centroid per merged group
    4. Build theme name from joined sub-labels
    5. Save themes.json

    Args:
        labels_json_path: Path to topic_labels.json.
        embeddings_npy_path: Path to .emb.npy.
        approved_topic_ids: List of lists of initial-code IDs.
            Each inner list is one candidate theme.
            Example: [[0,1,2],[3,4],[5]] creates 3
            candidate themes from 6 initial codes.

    Returns:
        JSON: themes_created count + themes_json path. needs_review=True.
    """
    labels_data = json.loads(Path(labels_json_path).read_text())
    embeddings = np.load(embeddings_npy_path)
    label_map = {item["topic_id"]: item for item in labels_data}

    def _merge_group(group_ids: list[int]) -> dict:
        """Merge one group of topic IDs into a single theme record."""
        # Unknown IDs are silently skipped, matching label_map.get semantics.
        members = [m for m in (label_map.get(tid) for tid in group_ids)
                   if m is not None]
        pooled_idx = [i for m in members for i in m.get("indices", [])]
        theme_centroid = _centroid(embeddings[pooled_idx or [0]])
        # Theme name: dedup the snake_case tokens of every sub-label while
        # preserving first-seen order, then cap at 60 characters.
        tokens = [tok for m in members
                  for tok in m.get("label", "").split("_")]
        theme_name = "_".join(dict.fromkeys(tokens))[:60]
        return {
            "theme_id": group_ids[0],
            "theme_label": theme_name,
            "merged_ids": group_ids,
            "total_papers": len(set(pooled_idx)),
            "indices": pooled_idx,
            "centroid": theme_centroid.tolist(),
        }

    themes = [_merge_group(group) for group in approved_topic_ids]
    out_path = Path(labels_json_path).parent / "themes.json"
    out_path.write_text(json.dumps(themes, indent=2))

    return json.dumps({
        "themes_created": len(themes),
        "themes_json": str(out_path),
        "needs_review": True,
    }, indent=2)
|
| 690 |
+
|
| 691 |
+
|
| 692 |
+
@tool
def compute_saturation(
    themes_json_path: str,
    embeddings_npy_path: str,
    total_papers: int,
) -> str:
    """Compute saturation metrics per theme: coverage, coherence, balance.

    Phase 4 – Reviewing Themes. DETERMINISTIC.

    Every number in the output is computed by numpy — the LLM never
    calculates these values. This eliminates hallucination risk for
    percentages, scores, and ratios.

    Metrics per theme:
        coverage  = papers_in_theme / total_papers (exact percentage)
        coherence = mean pairwise cosine similarity of member embeddings
                    (1.0 = all identical, 0.0 = orthogonal)

    Global metrics:
        total_coverage = papers in at least one theme / total_papers
        balance_ratio  = largest_theme / smallest_theme
        mean_coherence = average of per-theme coherence scores

    Args:
        themes_json_path: Path to themes.json.
        embeddings_npy_path: Path to .emb.npy.
        total_papers: Total papers in corpus (from Phase 1 stats).

    Returns:
        JSON: per-theme metrics + global metrics. needs_review=True.
    """
    themes = json.loads(Path(themes_json_path).read_text())
    embeddings = np.load(embeddings_npy_path)

    def _theme_metrics(theme: dict) -> dict:
        """Coverage and coherence for a single theme."""
        member_idx = theme.get("indices", [])
        vecs = embeddings[member_idx or [0]]
        pairwise = cosine_similarity(vecs)
        m = len(vecs)
        # Mean of the off-diagonal entries: drop the m self-similarities,
        # divide by the m*(m-1) remaining cells (guarded for m == 1).
        mean_off_diag = float((pairwise.sum() - m) / max(m * (m - 1), 1))
        return {
            "theme_id": theme.get("theme_id", 0),
            "theme_label": theme.get("theme_label", ""),
            "papers": len(member_idx),
            "coverage_pct": round(len(member_idx) / max(total_papers, 1) * 100, 2),
            "coherence": round(mean_off_diag, 4),
        }

    per_theme = [_theme_metrics(t) for t in themes]

    covered_papers = {i for t in themes for i in t.get("indices", [])}
    sizes = [m["papers"] for m in per_theme]
    coherences = [m["coherence"] for m in per_theme]

    global_metrics = {
        "total_coverage_pct": round(len(covered_papers) / max(total_papers, 1) * 100, 2),
        "balance_ratio": round(max(sizes, default=1) / max(min(sizes, default=1), 1), 2),
        "mean_coherence": round(sum(coherences) / max(len(coherences), 1), 4),
        "theme_count": len(themes),
    }

    out_path = Path(themes_json_path).parent / "saturation.json"
    result = {"per_theme": per_theme, "global": global_metrics}
    out_path.write_text(json.dumps(result, indent=2))

    return json.dumps({
        **global_metrics,
        "per_theme": per_theme,
        "saturation_json": str(out_path),
        "needs_review": True,
    }, indent=2)
|
| 768 |
+
|
| 769 |
+
|
| 770 |
+
@tool
def generate_theme_profiles(
    themes_json_path: str,
    embeddings_npy_path: str,
    texts_parquet_path: str,
    run_mode: str = "abstract",
) -> str:
    """Generate profile cards with top-5 nearest sentences per theme.

    Phase 5 – Defining and Naming Themes. DETERMINISTIC.

    For each theme centroid, computes cosine similarity against ALL
    embeddings and returns the 5 closest sentences. These are the
    REAL sentences from the corpus — not generated, not recalled
    from conversation history. The reviewer uses these to decide
    on final theme names.

    Steps:
    1. Load themes.json with centroids
    2. Load full embedding matrix
    3. Load original texts from parquet
    4. For each theme: cosine_similarity(centroid, all_embeddings)
    5. Take top 5 by similarity score
    6. Return exact sentence text + similarity score
    7. Save profiles.json

    Args:
        themes_json_path: Path to themes.json.
        embeddings_npy_path: Path to .emb.npy.
        texts_parquet_path: Path to .clean.parquet (for original text).
        run_mode: 'abstract' or 'title'.

    Returns:
        JSON: profiles list with top-5 sentences per theme. needs_review=True.
    """
    themes = json.loads(Path(themes_json_path).read_text())
    embeddings = np.load(embeddings_npy_path)
    target = f"{RUN_CONFIGS[run_mode][0]}_clean"
    texts = pd.read_parquet(texts_parquet_path)[target].tolist()

    def _profile(theme: dict) -> dict:
        """Profile card for one theme: the 5 corpus texts nearest its centroid."""
        centroid = np.array(theme["centroid"]).reshape(1, -1)
        sims = cosine_similarity(centroid, embeddings)[0]
        nearest_idx = np.argsort(sims)[::-1][:5].tolist()
        return {
            "theme_id": theme.get("theme_id", 0),
            "theme_label": theme.get("theme_label", ""),
            "total_papers": theme.get("total_papers", 0),
            "top_5_sentences": [
                {
                    "sentence_idx": i,
                    "text": texts[i][:300],
                    "similarity": round(float(sims[i]), 4),
                }
                for i in nearest_idx
            ],
        }

    profiles = [_profile(t) for t in themes]
    out_path = Path(themes_json_path).parent / "profiles.json"
    out_path.write_text(json.dumps(profiles, indent=2))

    return json.dumps({
        "profiles_count": len(profiles),
        "profiles_json": str(out_path),
        "profiles": profiles,
        "needs_review": True,
    }, indent=2)
|
| 840 |
+
|
| 841 |
+
|
| 842 |
+
@tool
def compare_with_taxonomy(themes_json_path: str) -> str:
    """Map each theme to PAJAIS 25 IS research categories via Mistral.

    Phase 5.5 – Taxonomy Alignment (extension). LLM-DEPENDENT.

    Themes with alignment_score < 0.50 are flagged as potentially NOVEL.

    Args:
        themes_json_path: Path to themes.json.

    Returns:
        JSON: themes_aligned count + taxonomy_file path. needs_review=True.
    """
    themes = json.loads(Path(themes_json_path).read_text())

    # Strip the bulky numeric fields before sending the themes to the LLM.
    safe_themes = [
        {key: val for key, val in theme.items()
         if key not in ("centroid", "indices")}
        for theme in themes
    ]

    template = (
        "You are an IS research taxonomy expert.\n\n"
        "PAJAIS 25 Categories:\n{pajais}\n\n"
        "Research themes:\n{themes}\n\n"
        "For EACH theme return a JSON array where every element has:\n"
        "  theme_label       : string\n"
        "  pajais_categories : list of 1-3 matching PAJAIS category names\n"
        "  alignment_score   : float 0.0-1.0\n"
        "  notes             : one sentence justification\n\n"
        "Return ONLY the JSON array — no markdown, no preamble."
    )

    chain = _mistral_chain(template)
    result = chain.invoke({
        "pajais": "\n".join(f"- {c}" for c in PAJAIS_25),
        "themes": json.dumps(safe_themes, indent=2),
    })

    out_path = Path(themes_json_path).parent / "taxonomy_alignment.json"
    out_path.write_text(json.dumps(result, indent=2))

    return json.dumps({
        "themes_aligned": len(result),
        "taxonomy_file": str(out_path),
        "needs_review": True,
    }, indent=2)
|
| 887 |
+
|
| 888 |
+
|
| 889 |
+
@tool
def generate_comparison_csv(
    abstract_themes_path: str,
    title_themes_path: str,
    taxonomy_abstract_path: str,
    taxonomy_title_path: str,
) -> str:
    """Build side-by-side abstract vs title comparison CSV.

    Phase 6 – Report. DETERMINISTIC.

    Joins on PAJAIS_Category. Delta_Score = Abstract - Title.

    Args:
        abstract_themes_path: themes.json — abstract run.
        title_themes_path: themes.json — title run.
        taxonomy_abstract_path: taxonomy_alignment.json — abstract run.
        taxonomy_title_path: taxonomy_alignment.json — title run.

    Returns:
        JSON: comparison_csv path, total_rows, columns. needs_review=True.
    """
    def _explode_taxonomy(path: str) -> pd.DataFrame:
        """One output row per (theme, PAJAIS category) pair."""
        entries = json.loads(Path(path).read_text())
        rows = [
            {
                "pajais_category": category,
                "theme_label": entry.get("theme_label", ""),
                "alignment_score": entry.get("alignment_score", 0.0),
            }
            for entry in entries
            for category in entry.get("pajais_categories", [])
        ]
        return pd.DataFrame(rows)

    df_abs = _explode_taxonomy(taxonomy_abstract_path)
    df_title = _explode_taxonomy(taxonomy_title_path)

    df_abs.columns = ["PAJAIS_Category", "Abstract_Theme", "Abstract_Score"]
    df_title.columns = ["PAJAIS_Category", "Title_Theme", "Title_Score"]

    # Outer join so categories present in only one run still appear.
    merged = pd.merge(df_abs, df_title, on="PAJAIS_Category", how="outer")
    merged = merged.fillna({"Abstract_Score": 0.0, "Title_Score": 0.0,
                            "Abstract_Theme": "", "Title_Theme": ""})
    merged["Delta_Score"] = (merged["Abstract_Score"] - merged["Title_Score"]).round(4)
    merged = merged.sort_values("PAJAIS_Category").reset_index(drop=True)

    out_csv = Path(abstract_themes_path).parent / "abstract_vs_title_comparison.csv"
    merged.to_csv(out_csv, index=False)

    return json.dumps({
        "comparison_csv": str(out_csv),
        "total_rows": len(merged),
        "columns": list(merged.columns),
        "needs_review": True,
    }, indent=2)
|
| 954 |
+
|
| 955 |
+
|
| 956 |
+
@tool
def export_narrative(
    taxonomy_alignment_path: str,
    comparison_csv_path: str,
    run_mode: str = "abstract",
) -> str:
    """Generate 500-word Section 7: Discussion & Implications via Mistral.

    Phase 6 – Report. LLM-DEPENDENT (grounded in taxonomy + comparison data).

    Args:
        taxonomy_alignment_path: Path to taxonomy_alignment.json.
        comparison_csv_path: Path to comparison CSV.
        run_mode: 'abstract' or 'title'.

    Returns:
        JSON: narrative_path, word_count, narrative text. needs_review=True.
    """
    alignment = json.loads(Path(taxonomy_alignment_path).read_text())

    # Rank PAJAIS categories by absolute abstract-vs-title divergence,
    # using a throwaway helper column that never reaches the output.
    comparison = pd.read_csv(comparison_csv_path)
    comparison["_abs"] = comparison["Delta_Score"].abs()
    comparison = comparison.sort_values("_abs", ascending=False).drop(columns=["_abs"])
    top_delta = comparison.head(5)

    template = (
        "You are a senior IS researcher writing a systematic literature review.\n\n"
        "Write Section 7: Discussion & Implications in exactly {word_count} words.\n\n"
        "Run mode: {run_mode}\n\n"
        "Taxonomy alignment (top 10):\n{alignment}\n\n"
        "Top 5 divergent PAJAIS categories (abstract vs title):\n{divergence}\n\n"
        "Requirements:\n"
        "1. Discuss dominant themes and PAJAIS alignment.\n"
        "2. Interpret divergence between abstract- and title-based models.\n"
        "3. Highlight implications for IS research practice and future agenda.\n"
        "4. Use formal academic register — no bullet points.\n"
        "5. Return a JSON object with a single key 'narrative' containing the prose.\n\n"
        "Return ONLY valid JSON."
    )

    chain = _mistral_chain(template)
    result = chain.invoke({
        "word_count": NARRATIVE_WORDS,
        "run_mode": run_mode,
        "alignment": json.dumps(alignment[:10], indent=2),
        "divergence": top_delta.to_json(orient="records", indent=2),
    })
    narrative_text = result.get("narrative", str(result))

    out_path = Path(taxonomy_alignment_path).parent / "narrative.md"
    out_path.write_text(
        f"## Section 7: Discussion & Implications\n\n{narrative_text}\n",
        encoding="utf-8",
    )

    return json.dumps({
        "narrative_path": str(out_path),
        "word_count": len(narrative_text.split()),
        "narrative": narrative_text,
        "needs_review": True,
    }, indent=2)
|
| 1018 |
+
|
| 1019 |
+
|
| 1020 |
+
# Registry of every pipeline tool defined in this module, listed in phase
# order: corpus load -> BERTopic discovery -> LLM labeling -> sentence
# reassignment -> theme consolidation -> saturation check -> theme profiling
# -> taxonomy comparison -> comparison CSV export -> narrative export.
# Presumably bound to the agent's LLM at startup — verify in the agent setup.
# Order is preserved as authored; do not reorder without checking consumers.
ALL_TOOLS = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    reassign_sentences,
    consolidate_into_themes,
    compute_saturation,
    generate_theme_profiles,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]
|