Spaces:
Sleeping
Sleeping
Initial commit
Browse files- agent.py +205 -0
- app.py +461 -0
- requirements.txt +26 -0
- tools.py +443 -0
agent.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
agent.py — LangGraph ReAct agent for Braun & Clarke (2006) thematic analysis.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
from langgraph.prebuilt import create_react_agent
|
| 8 |
+
from langgraph.checkpoint.memory import MemorySaver
|
| 9 |
+
from langchain_mistralai import ChatMistralAI
|
| 10 |
+
|
| 11 |
+
from tools import (
|
| 12 |
+
load_scopus_csv,
|
| 13 |
+
run_bertopic_discovery,
|
| 14 |
+
label_topics_with_llm,
|
| 15 |
+
consolidate_into_themes,
|
| 16 |
+
compare_with_taxonomy,
|
| 17 |
+
generate_comparison_csv,
|
| 18 |
+
export_narrative,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
# System prompt
|
| 23 |
+
# ---------------------------------------------------------------------------
|
| 24 |
+
|
| 25 |
+
# System prompt handed to the ReAct agent. It fixes the agent's role, the
# one-phase-per-turn rule, the seven tools, and the wording of every stop gate.
# NOTE(review): dashes, rule lines and stop markers were restored from
# mis-decoded text; the exact glyphs are a best guess and carry no meaning
# beyond visual separation.
SYSTEM_PROMPT = """
You are a computational thematic analysis expert specialising in Braun & Clarke (2006)
six-phase thematic analysis applied to systematic literature reviews. You work with
Scopus CSV exports and guide researchers through a rigorous, reproducible analysis
pipeline using BERTopic clustering and LLM-assisted labelling.

═══════════════════════════════════════════════════════════════════
ROLE
═══════════════════════════════════════════════════════════════════
- Expert in qualitative and computational thematic analysis
- Familiar with PAJAIS (25 AI research categories) taxonomy
- Methodologically rigorous: one phase per message, no skipping
- You EXPLAIN what you did, what you found, and what the researcher should do next
- You never proceed to the next phase without explicit user approval via the review table

═══════════════════════════════════════════════════════════════════
CRITICAL RULES
═══════════════════════════════════════════════════════════════════
1. Complete EXACTLY ONE phase per conversational turn, then STOP and wait.
2. ALL topic approvals, renames, and groupings happen via the REVIEW TABLE — never via chat.
3. Never ask the user to type topic labels or approvals into the chat.
4. After every phase, output a clear STOP GATE message telling the user what to review.
5. You must call the appropriate tool for each phase — do NOT fabricate results.
6. Always report tool outputs clearly: total papers, sentences, clusters, themes.
7. When showing the review table, list all columns: #, Topic Label, Top Evidence,
   Sentences, Papers, Approve (Yes/No), Rename To, Reasoning.
8. Progress is tracked in the phase progress bar — reference the current phase by name.

═══════════════════════════════════════════════════════════════════
AVAILABLE TOOLS
═══════════════════════════════════════════════════════════════════
1. load_scopus_csv         — Load CSV, count papers/sentences, apply boilerplate filter
2. run_bertopic_discovery  — Embed + cluster sentences, find centroids, generate 4 charts
3. label_topics_with_llm   — Send top-100 topics to Mistral for human-readable labels
4. consolidate_into_themes — Merge approved topic groups into named themes, recompute centroids
5. compare_with_taxonomy   — Map final themes to PAJAIS 25 categories
6. generate_comparison_csv — Abstract vs title side-by-side CSV export
7. export_narrative        — Generate ~500-word Section 7 narrative via Mistral

═══════════════════════════════════════════════════════════════════
BRAUN & CLARKE (2006) — SIX PHASES
═══════════════════════════════════════════════════════════════════

──────────────────────────────────────────────────────────────────
PHASE 1 — Familiarisation with the Data
──────────────────────────────────────────────────────────────────
Steps:
1. Call load_scopus_csv with the uploaded CSV path and run_config="abstract".
2. Report: total papers, total sentences after boilerplate filtering, columns used.
3. Show a brief sample of 3–5 cleaned abstracts.
4. Explain what boilerplate was removed and why.
5. Confirm the dataset is ready for initial coding.

⛔ STOP GATE 1: After reporting statistics, STOP. Tell the user:
   "Phase 1 complete. Please review the dataset statistics above. When ready,
   type 'proceed to Phase 2' to begin BERTopic clustering."

──────────────────────────────────────────────────────────────────
PHASE 2 — Generating Initial Codes
──────────────────────────────────────────────────────────────────
Steps:
1. Call run_bertopic_discovery on the cleaned parquet file.
2. Call label_topics_with_llm to generate human-readable labels for top-100 clusters.
3. Populate the REVIEW TABLE with all labelled topics (columns: #, Topic Label,
   Top Evidence, Sentences, Papers, Approve, Rename To, Reasoning).
4. Explain the clustering method (all-MiniLM-L6-v2 + AgglomerativeClustering cosine 0.7).
5. Show the 4 generated charts in the Charts tab.

⛔ STOP GATE 2: After displaying the review table, STOP. Tell the user:
   "Phase 2 complete. Please review the 100 topics in the Review Table.
   For each topic: set Approve=Yes/No, optionally fill Rename To and Reasoning.
   Group related topics by noting the same new label. When done, click 'Submit Review'."
   DO NOT proceed until Submit Review is clicked.

──────────────────────────────────────────────────────────────────
PHASE 3 — Searching for Themes
──────────────────────────────────────────────────────────────────
Steps:
1. Parse the submitted review table to extract approved topics and their groupings.
2. Call consolidate_into_themes with the approved groups JSON.
3. Present the consolidated themes with: theme name, constituent topics, top sentences,
   and sentence count.
4. Explain how topics were merged and centroids recomputed.

⛔ STOP GATE 3: After showing consolidated themes, STOP. Tell the user:
   "Phase 3 complete. Please review the consolidated themes in the Review Table.
   Approve, rename, or merge themes as needed. Click 'Submit Review' when done."
   DO NOT proceed until Submit Review is clicked.

──────────────────────────────────────────────────────────────────
PHASE 4 — Reviewing Themes (Saturation Check)
──────────────────────────────────────────────────────────────────
Steps:
1. Compute coverage: what % of total sentences are captured by approved themes.
2. Identify any sentences/topics NOT covered by a theme (orphan codes).
3. Report saturation metrics: coverage %, orphan count, theme overlap.
4. Suggest whether any orphan codes warrant a new theme or should be discarded.
5. Update the review table with coverage statistics per theme.

⛔ STOP GATE 4: After reporting saturation, STOP. Tell the user:
   "Phase 4 complete. Coverage is [X]%. Please review the saturation report.
   Adjust theme groupings in the Review Table if needed. Click 'Submit Review'
   to confirm final themes."
   DO NOT proceed until Submit Review is clicked.

──────────────────────────────────────────────────────────────────
PHASE 5 — Defining and Naming Themes
──────────────────────────────────────────────────────────────────
Steps:
1. For each confirmed theme, generate: a definitive name, a 2-sentence definition,
   and 3 exemplary quotes from the data.
2. Explain how the name captures the essence of the theme.
3. Ensure theme names are analytic (not merely descriptive).
4. Present the finalised theme map.

⛔ STOP GATE 5 (implicit): Present the final theme map and ask:
   "Phase 5 complete. Please confirm the final theme names and definitions above.
   When satisfied, type 'proceed to PAJAIS mapping'."

──────────────────────────────────────────────────────────────────
PHASE 5.5 — PAJAIS Taxonomy Mapping
──────────────────────────────────────────────────────────────────
Steps:
1. Call compare_with_taxonomy to map each theme to PAJAIS 25 categories.
2. Present a mapping table: Theme → PAJAIS Category, Confidence, Rationale.
3. Highlight any themes that map to multiple categories (ambiguous cases).

⛔ STOP GATE 5.5: After presenting the mapping, STOP. Tell the user:
   "PAJAIS mapping complete. Please review the taxonomy mappings in the Review Table.
   Adjust any incorrect mappings. Click 'Submit Review' to confirm."
   DO NOT proceed until Submit Review is clicked.

──────────────────────────────────────────────────────────────────
PHASE 6 — Producing the Report
──────────────────────────────────────────────────────────────────
Steps:
1. Call generate_comparison_csv to produce the abstract vs title comparison.
2. Call export_narrative to generate the ~500-word Section 7 discussion.
3. Present the narrative inline and confirm all files are ready for download.
4. List all downloadable outputs: comparison CSV, narrative.md, topics.json,
   themes.json, taxonomy_mapping.json, charts.
5. Congratulate the researcher and summarise the full analysis pipeline.

No STOP GATE — Phase 6 is the final deliverable.

═══════════════════════════════════════════════════════════════════
OUTPUT FORMAT GUIDELINES
═══════════════════════════════════════════════════════════════════
- Always start your response with: **Phase X — [Phase Name]** and the progress %.
- Use markdown tables for review tables.
- Use code blocks for JSON snippets.
- End every non-final phase with a clearly marked ⛔ STOP message.
- When referencing tool outputs, always show the key numbers (papers, sentences, clusters).
"""
|
| 179 |
+
|
| 180 |
+
# ---------------------------------------------------------------------------
|
| 181 |
+
# Agent construction
|
| 182 |
+
# ---------------------------------------------------------------------------
|
| 183 |
+
|
| 184 |
+
# Chat model behind the agent; temperature 0 requests the least random output.
_llm = ChatMistralAI(model="mistral-large-latest", temperature=0)

# Every tool imported from tools.py, exposed to the agent.
_tools = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]

# Checkpointer that stores conversation state per thread_id.
_memory = MemorySaver()

# The compiled ReAct agent that app.py imports and invokes.
agent = create_react_agent(
    _llm,
    _tools,
    checkpointer=_memory,
    prompt=SYSTEM_PROMPT,
)

__all__ = ["agent", "SYSTEM_PROMPT"]
|
app.py
ADDED
|
@@ -0,0 +1,461 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app.py — Gradio Blocks UI for the BERTopic Thematic Analysis Agent.
|
| 3 |
+
Sections: (1) Data Input, (2) Agent Conversation, (3) Results
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import uuid
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
import gradio as gr
|
| 14 |
+
import pandas as pd
|
| 15 |
+
import plotly.io as pio
|
| 16 |
+
|
| 17 |
+
from agent import agent
|
| 18 |
+
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
# Constants
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
# Conversation thread id, generated once when this module is imported.
# NOTE(review): every call made with AGENT_CONFIG uses this same id, so all
# browser sessions served by one process share a single agent conversation.
THREAD_ID = str(uuid.uuid4())
AGENT_CONFIG = {
    "configurable": {"thread_id": THREAD_ID},
    "recursion_limit": 100,
}

# Column headers of the editable review table, in display order.
REVIEW_COLUMNS = [
    "#",
    "Topic Label",
    "Top Evidence",
    "Sentences",
    "Papers",
    "Approve",
    "Rename To",
    "Reasoning",
]

# (code, short name) pairs drawn in the progress bar, in pipeline order.
PHASE_LABELS = [
    ("Phase 1", "Familiarisation"),
    ("Phase 2", "Initial Codes"),
    ("Phase 3", "Themes"),
    ("Phase 4", "Saturation"),
    ("Phase 5", "Naming"),
    ("Phase 5.5", "PAJAIS"),
    ("Phase 6", "Report"),
]

# Dropdown labels for the Charts tab. Each label pairs positionally with the
# entry at the same index in _CHART_KEYS, so keep both lists the same length.
CHART_OPTIONS = [
    "Bar — Top 20 Topics",
    "Treemap — Topic Distribution",
    "Scatter — Cluster PCA",
    "Heatmap — Topic Similarity",
]

# Keys looked up in charts.json for the labels above.
_CHART_KEYS = ["bar_top20", "treemap", "scatter_pca", "heatmap"]
|
| 58 |
+
|
| 59 |
+
# ---------------------------------------------------------------------------
|
| 60 |
+
# Helpers
|
| 61 |
+
# ---------------------------------------------------------------------------
|
| 62 |
+
|
| 63 |
+
def _phase_bar_html(active_index: int) -> str:
    """Render the phase progress bar as an inline-styled HTML snippet.

    Args:
        active_index: Zero-based position in ``PHASE_LABELS`` of the phase in
            progress. Phases before it are drawn as done (green), the phase at
            that position as active (indigo), and later ones as pending
            (grey). A value that matches no position, such as ``-1``, draws
            every phase as pending.

    Returns:
        One HTML string: a numbered circle with a two-line caption per phase,
        and a short connector line between consecutive phases.
    """
    last_index = len(PHASE_LABELS) - 1
    pieces = []
    for i, (code, name) in enumerate(PHASE_LABELS):
        if i < active_index:
            bg, fg = "#10b981", "#ffffff"  # done
        elif i == active_index:
            bg, fg = "#6366f1", "#ffffff"  # active
        else:
            bg, fg = "#e5e7eb", "#6b7280"  # pending
        pieces.append(
            f'<div style="display:flex;flex-direction:column;align-items:center;gap:4px;flex:1;">'
            f'<div style="width:32px;height:32px;border-radius:50%;background:{bg};'
            f'color:{fg};display:flex;align-items:center;justify-content:center;'
            f'font-size:11px;font-weight:600;">{i+1}</div>'
            f'<span style="font-size:10px;color:#374151;text-align:center;line-height:1.2;">'
            f'{code}<br>{name}</span>'
            f'</div>'
        )
        if i < last_index:
            # The connector after a finished phase is green, otherwise grey.
            line_bg = "#10b981" if i < active_index else "#e5e7eb"
            pieces.append(
                f'<div style="flex:1;height:2px;background:{line_bg};margin-top:16px;'
                f'max-width:40px;"></div>'
            )
    steps_html = "".join(pieces)
    return (
        f'<div style="padding:16px 8px;background:#f9fafb;border-radius:12px;'
        f'border:1px solid #e5e7eb;margin-bottom:8px;">'
        f'<div style="display:flex;align-items:flex-start;justify-content:space-between;">'
        f'{steps_html}</div></div>'
    )
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _empty_review_df() -> pd.DataFrame:
    """Return a zero-row review table that carries only the ``REVIEW_COLUMNS`` headers."""
    blank_table = pd.DataFrame(columns=REVIEW_COLUMNS)
    return blank_table
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def _load_charts() -> dict:
    """Load the chart payloads stored in ``charts.json`` in the working directory.

    Returns:
        The decoded JSON object. An empty dict is returned when the file is
        missing, is not valid JSON, or does not hold a JSON object.
    """
    try:
        raw = Path("charts.json").read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}
    try:
        charts = json.loads(raw)
    except json.JSONDecodeError:
        # Chart display is best-effort: a truncated or corrupt file should
        # leave the chart pane empty, not raise inside the UI handler.
        return {}
    # Callers use .get(), so anything other than a JSON object is unusable.
    return charts if isinstance(charts, dict) else {}
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _call_agent(message: str, history: list):
    """Send one user message to the agent and extend the chat history with the exchange.

    Args:
        message: Text forwarded to the agent as a single user turn.
        history: Chat history as a list of ``{"role", "content"}`` dicts. It is
            not modified; a new list is returned.

    Returns:
        ``(updated_history, "")``: the history with the user message and the
        content of the last message in the agent's result appended, followed
        by an empty string.
    """
    payload = {"messages": [{"role": "user", "content": message}]}
    result = agent.invoke(payload, config=AGENT_CONFIG)

    # The reply shown in the chat is the content of the final message.
    reply = result["messages"][-1].content

    exchange = [
        {"role": "user", "content": message},
        {"role": "assistant", "content": reply},
    ]
    return history + exchange, ""
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def _submit_review(
    review_df: pd.DataFrame,
    history: list,
) -> tuple[list, str, pd.DataFrame]:
    """Turn the edited review table into theme groups and send them to the agent.

    Rows whose "Approve" cell reads "yes" (any case, surrounding blanks
    ignored) are grouped by theme name. The name is the row's "Rename To"
    text, else its "Topic Label", else ``Theme_<id>``. Rows that share a name
    land in the same group.

    Args:
        review_df: Current contents of the review table.
        history: Chat history passed on to ``_call_agent``.

    Returns:
        ``(updated_history, "", refreshed_table)``: the chat history including
        the agent's reply, an empty string, and the table rebuilt by
        ``_refresh_review_table``.
    """
    approved_count, groups_list = _collect_review_groups(review_df)

    summary = (
        f"Review submitted. Approved topics: {approved_count}.\n"
        f"Groups formed: {len(groups_list)}.\n\n"
        f"{json.dumps(groups_list, indent=2)}\n\n"
        f"Please consolidate these groups into themes."
    )

    updated_history, _ = _call_agent(summary, history)
    refreshed = _refresh_review_table()
    return updated_history, "", refreshed


def _cell_text(value) -> str:
    """Return a table cell as stripped text, mapping None, NaN and pd.NA to ""."""
    if value is None or value is pd.NA:
        return ""
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return ""
    return str(value).strip()


def _parse_topic_id(value):
    """Return the "#" cell as an int, or None when it is not a whole number."""
    try:
        number = float(_cell_text(value))
    except ValueError:
        return None
    # Accept 3 and 3.0 alike; reject fractions, inf and nan.
    return int(number) if number.is_integer() else None


def _collect_review_groups(review_df) -> tuple[int, list[dict]]:
    """Return (number of approved rows, list of {"theme_name", "topic_ids"} groups)."""
    if review_df is None or review_df.empty or "Approve" not in review_df.columns:
        return 0, []

    is_approved = review_df["Approve"].map(
        lambda cell: _cell_text(cell).lower() == "yes"
    )
    approved = review_df[is_approved.astype(bool)]

    groups: dict = {}
    for _, row in approved.iterrows():
        topic_id = _parse_topic_id(row.get("#"))
        if topic_id is None:
            # A row without a usable id cannot be mapped to a topic. Skipping
            # it avoids filing it under topic 0 by accident.
            continue
        theme_name = (
            _cell_text(row.get("Rename To"))
            or _cell_text(row.get("Topic Label"))
            or f"Theme_{topic_id}"
        )
        groups.setdefault(theme_name, []).append(topic_id)

    groups_list = [
        {"theme_name": name, "topic_ids": ids} for name, ids in groups.items()
    ]
    return len(approved), groups_list
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def _upload_csv(file_obj):
    """Handle a CSV upload: clear the previous run's outputs and report the path.

    Args:
        file_obj: Value delivered by the upload widget. Either a path string,
            an object whose ``name`` attribute holds the path, or ``None``
            when no file is present.

    Returns:
        ``(path, status_markdown)``. ``path`` is ``""`` when nothing was
        uploaded.
    """
    if file_obj is None:
        return "", "No file uploaded."

    # 🔥 CLEAR OLD FILES
    # Everything the UI reads back from disk is removed, so a new dataset
    # never shows themes, charts or downloads from the previous one.
    stale_outputs = (
        "labelled_topics.json",
        "summaries.json",
        "taxonomy_mapping.json",
        "comparison.csv",
        "report.txt",
        # Read by _refresh_review_table, _load_charts and _get_download_files.
        "themes.json",
        "charts.json",
        "topics.json",
        "narrative.md",
        "comparison_abstract_vs_title.csv",
    )
    for stale in stale_outputs:
        try:
            os.remove(stale)
        except FileNotFoundError:
            pass  # nothing to clear for this output

    # Works for both a plain path string and an object exposing .name.
    path = getattr(file_obj, "name", file_obj)
    return path, f"✅ File ready: `{path}`"
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def _start_analysis(csv_path: str, history: list) -> tuple[list, str, str, pd.DataFrame]:
    """Ask the agent to run Phase 1 on the uploaded CSV.

    Args:
        csv_path: Path of the uploaded CSV; empty when nothing was uploaded.
        history: Current chat history.

    Returns:
        ``(history, "", phase_html, review_table)``. When ``csv_path`` is
        empty the agent is not called: the history comes back unchanged and
        a warning string is returned in the ``phase_html`` position.
        Otherwise the history includes the agent's reply and the bar
        highlights the first phase. The review table is empty in both cases.
    """
    if not csv_path:
        return history, "", "⚠️ Please upload a CSV first.", _empty_review_df()
    msg = (
        f"I have uploaded a Scopus CSV at: {csv_path}\n"
        "Please begin Phase 1 — Familiarisation. Load the CSV, report statistics, "
        "and STOP after Phase 1."
    )
    updated_history, _ = _call_agent(msg, history)
    phase_html = _phase_bar_html(0)
    return updated_history, "", phase_html, _empty_review_df()
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def _send_message(user_msg: str, history: list, phase_html: str) -> tuple[list, str, str, pd.DataFrame]:
    """Forward a chat message to the agent and refresh the dependent widgets.

    Args:
        user_msg: Text typed into the chat box.
        history: Current chat history.
        phase_html: HTML currently shown in the phase progress bar.

    Returns:
        ``(history, "", phase_html, review_table)``. A blank message skips the
        agent, so history and phase bar come back unchanged. Otherwise the
        history includes the new exchange and the phase bar reflects whatever
        ``_detect_phase`` finds in the agent's reply. The review table is
        rebuilt from disk in both cases.
    """
    if user_msg.strip():
        history, _ = _call_agent(user_msg, history)
        latest_reply = history[-1]["content"] if history else ""
        phase_html = _detect_phase(latest_reply, phase_html)
    return history, "", phase_html, _refresh_review_table()
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def _detect_phase(ai_text: str, current_html: str) -> str:
    """Work out which phase the agent's reply refers to and render the bar for it.

    A phase counts as mentioned when the reply contains "<phase> complete",
    "beginning <phase>" or "starting <phase>" (case-insensitive). The
    Phase 5.5 stop gate is worded "PAJAIS mapping complete", so that phrase
    is recognised as well.

    Args:
        ai_text: The agent's latest reply.
        current_html: Phase-bar HTML currently on display.

    Returns:
        Fresh phase-bar HTML for the furthest phase mentioned, or
        ``current_html`` unchanged when no phase is mentioned.
    """
    phase_index = {
        "phase 1": 0, "phase 2": 1, "phase 3": 2,
        "phase 4": 3, "phase 5": 4, "phase 5.5": 5, "phase 6": 6,
    }
    lower = str(ai_text or "").lower()

    mentioned = [
        idx
        for key, idx in phase_index.items()
        if f"{key} complete" in lower
        or f"beginning {key}" in lower
        or f"starting {key}" in lower
    ]
    if "pajais mapping complete" in lower:
        mentioned.append(phase_index["phase 5.5"])

    if not mentioned:
        return current_html
    # Take the furthest phase. A recap that names earlier phases must not
    # move the bar backwards, and "beginning phase 5.5" also contains
    # "beginning phase 5", so the larger index is the right one.
    return _phase_bar_html(max(mentioned))
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def _get_chart_plot(chart_name: str):
    """Return the figure for a dropdown label, or ``None`` when there is nothing to draw.

    Args:
        chart_name: One of the labels in ``CHART_OPTIONS``.

    Returns:
        The figure decoded from the stored payload. ``None`` is returned when
        the label is unknown, the payload is missing or empty, or the payload
        text starts with ``<``.
    """
    charts = _load_charts()
    label_to_key = dict(zip(CHART_OPTIONS, _CHART_KEYS))
    payload = charts.get(label_to_key.get(chart_name, ""), "")
    if not payload:
        return None
    if str(payload).lstrip().startswith("<"):
        return None
    return pio.from_json(payload)
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def _get_download_files() -> list[str]:
    """List the analysis outputs that currently exist in the working directory.

    Returns:
        The names of the known output files that are present as regular
        files, in a fixed order. Directories with a matching name are
        skipped.
    """
    candidates = [
        "comparison_abstract_vs_title.csv",
        "narrative.md",
        "topics.json",
        "labelled_topics.json",
        "themes.json",
        "taxonomy_mapping.json",
        "summaries.json",
    ]
    return [name for name in candidates if Path(name).is_file()]
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def _refresh_review_table() -> pd.DataFrame:
    """Rebuild the review table from the JSON files in the working directory.

    ``themes.json`` takes precedence: when it exists, one row is produced per
    theme, numbered from 1. Otherwise the first 100 entries of
    ``labelled_topics.json`` are used, numbered by their ``topic_id``. With
    neither file present the table is empty.

    Returns:
        A DataFrame with the ``REVIEW_COLUMNS`` headers. The headers are kept
        even when the source file holds an empty list. Every row starts with
        "Approve" set to "Yes" and blank "Papers" and "Rename To" cells.
    """
    themes_path = Path("themes.json")
    if themes_path.exists():
        themes = json.loads(themes_path.read_text(encoding="utf-8"))
        theme_rows = [
            {
                "#": position,
                "Topic Label": theme.get("theme_name", f"Theme {position}"),
                "Top Evidence": " | ".join(theme.get("top_sentences", [])[:2]),
                # Counts the stored top sentences, not every sentence in the theme.
                "Sentences": len(theme.get("top_sentences", [])),
                "Papers": "",
                "Approve": "Yes",
                "Rename To": "",
                "Reasoning": "",
            }
            for position, theme in enumerate(themes, start=1)
        ]
        return pd.DataFrame(theme_rows, columns=REVIEW_COLUMNS)

    topics_path = Path("labelled_topics.json")
    if not topics_path.exists():
        return _empty_review_df()
    topics = json.loads(topics_path.read_text(encoding="utf-8"))
    topic_rows = [
        {
            "#": topic["topic_id"],
            "Topic Label": topic.get("label", f"Topic {topic['topic_id']}"),
            "Top Evidence": " | ".join(topic.get("top_sentences", [])[:2]),
            "Sentences": topic.get("sentence_count", 0),
            "Papers": "",
            "Approve": "Yes",
            "Rename To": "",
            "Reasoning": topic.get("reasoning", ""),
        }
        for topic in topics[:100]
    ]
    return pd.DataFrame(topic_rows, columns=REVIEW_COLUMNS)
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def _refresh_downloads() -> list[str]:
    """Return the existing output files, or ``None`` when there are none."""
    available = _get_download_files()
    if not available:
        return None
    return available
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
# ---------------------------------------------------------------------------
|
| 281 |
+
# Build UI
|
| 282 |
+
# ---------------------------------------------------------------------------
|
| 283 |
+
|
| 284 |
+
# Page layout: header, phase bar, then three sections (data input, chat,
# results) followed by the event wiring that connects widgets to handlers.
with gr.Blocks(
    title="BERTopic Thematic Analysis Agent",
) as demo:

    # ---- State ----
    csv_path_state = gr.State("")

    # ---- Header ----
    gr.HTML(
        '<div style="padding:24px 0 8px;">'
        '<h1 style="font-size:1.6rem;font-weight:600;margin:0;color:#1e1b4b;">'
        '🔬 BERTopic Thematic Analysis Agent</h1>'
        '<p style="color:#6b7280;margin:4px 0 0;font-size:0.95rem;">'
        'Braun & Clarke (2006) · Six-Phase Pipeline · PAJAIS Taxonomy</p>'
        '</div>'
    )

    # ---- Phase Progress Bar ----
    # -1 matches no phase, so every step starts out as pending.
    phase_bar = gr.HTML(value=_phase_bar_html(-1), label="Phase Progress")

    # ────────────────────────────────────────────────────────
    # SECTION 1 — Data Input
    # ────────────────────────────────────────────────────────
    with gr.Group():
        gr.Markdown("## 1 · Data Input")
        with gr.Row():
            with gr.Column(scale=2):
                file_upload = gr.File(
                    label="Upload Scopus CSV",
                    file_types=[".csv"],
                    type="filepath",
                )
                file_status = gr.Markdown("_No file uploaded._")
            with gr.Column(scale=1):
                # NOTE(review): this choice is not passed to any handler below.
                run_config = gr.Radio(
                    choices=["abstract", "title"],
                    value="abstract",
                    label="Run Config (field to cluster)",
                )
        start_btn = gr.Button("▶ Start Analysis", variant="primary", size="lg")

    # ────────────────────────────────────────────────────────
    # SECTION 2 — Agent Conversation
    # ────────────────────────────────────────────────────────
    with gr.Group():
        gr.Markdown("## 2 · Agent Conversation")
        chatbot = gr.Chatbot(
            label="Thematic Analysis Agent"
        )
        with gr.Row():
            chat_input = gr.Textbox(
                placeholder="Type a message or instruction… (e.g. 'proceed to Phase 2')",
                label="",
                scale=5,
                show_label=False,
                lines=1,
            )
            send_btn = gr.Button("Send", variant="primary", scale=1)

    # ────────────────────────────────────────────────────────
    # SECTION 3 — Results
    # ────────────────────────────────────────────────────────
    with gr.Group():
        gr.Markdown("## 3 · Results")
        with gr.Tabs():

            # --- Tab 1: Review Table ---
            with gr.TabItem("📋 Review Table"):
                with gr.Row():
                    refresh_table_btn = gr.Button("🔄 Refresh Table", size="sm")
                review_table = gr.Dataframe(
                    value=_empty_review_df(),
                    headers=REVIEW_COLUMNS,
                    datatype=[
                        "number", "str", "str", "number",
                        "str", "str", "str", "str",
                    ],
                    column_count=(8, "fixed"),
                    interactive=True,
                    wrap=True,
                    label="Topic Review Table (edit Approve / Rename To / Reasoning)"
                )
                submit_review_btn = gr.Button(
                    "✅ Submit Review", variant="primary", size="lg"
                )

            # --- Tab 2: Charts ---
            with gr.TabItem("📊 Charts"):
                chart_dropdown = gr.Dropdown(
                    choices=CHART_OPTIONS,
                    value=CHART_OPTIONS[0],
                    label="Select Chart",
                    interactive=True,
                )
                chart_display = gr.Plot(label="Chart")

            # --- Tab 3: Download ---
            with gr.TabItem("⬇ Download"):
                refresh_dl_btn = gr.Button("🔄 Refresh Files", size="sm")
                download_files = gr.File(
                    label="Download Analysis Outputs",
                    file_count="multiple",
                    interactive=False,
                    value=None,
                )

    # ────────────────────────────────────────────────────────
    # Event wiring
    # ────────────────────────────────────────────────────────

    # Upload CSV → store path
    file_upload.change(
        fn=_upload_csv,
        inputs=[file_upload],
        outputs=[csv_path_state, file_status],
    )

    # Start analysis button
    start_event = start_btn.click(
        fn=_start_analysis,
        inputs=[csv_path_state, chatbot],
        outputs=[chatbot, chat_input, phase_bar, review_table],
    )

    # Send message (button)
    send_event = send_btn.click(
        fn=_send_message,
        inputs=[chat_input, chatbot, phase_bar],
        outputs=[chatbot, chat_input, phase_bar, review_table],
    )

    # Send message (Enter key)
    enter_event = chat_input.submit(
        fn=_send_message,
        inputs=[chat_input, chatbot, phase_bar],
        outputs=[chatbot, chat_input, phase_bar, review_table],
    )

    # Submit review table
    review_event = submit_review_btn.click(
        fn=_submit_review,
        inputs=[review_table, chatbot],
        outputs=[chatbot, chat_input, review_table],
    )

    # Refresh review table
    refresh_table_btn.click(
        fn=_refresh_review_table,
        inputs=[],
        outputs=[review_table],
    )

    # Chart dropdown
    chart_dropdown.change(
        fn=_get_chart_plot,
        inputs=[chart_dropdown],
        outputs=[chart_display],
    )

    # The change event never fires for the dropdown's initial value, so the
    # first chart would stay blank. Draw the selected chart on page load and
    # again after every agent turn, since a turn may have produced charts.
    demo.load(
        fn=_get_chart_plot,
        inputs=[chart_dropdown],
        outputs=[chart_display],
    )
    for agent_event in (start_event, send_event, enter_event, review_event):
        agent_event.then(
            fn=_get_chart_plot,
            inputs=[chart_dropdown],
            outputs=[chart_display],
        )

    # Refresh downloads
    refresh_dl_btn.click(
        fn=_refresh_downloads,
        inputs=[],
        outputs=[download_files],
    )
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
# ---------------------------------------------------------------------------
|
| 452 |
+
# Launch
|
| 453 |
+
# ---------------------------------------------------------------------------
|
| 454 |
+
|
| 455 |
+
if __name__ == "__main__":
    # Serve on every network interface at port 7860 and surface handler
    # errors in the browser.
    # NOTE(review): `theme` is passed to launch() here, not gr.Blocks();
    # confirm the installed Gradio version accepts it in this position.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "theme": gr.themes.Soft(primary_hue="indigo"),
    }
    demo.launch(**launch_options)
|
requirements.txt
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core ML / NLP
|
| 2 |
+
sentence-transformers==3.3.1
|
| 3 |
+
scikit-learn==1.6.1
|
| 4 |
+
numpy==1.26.4
|
| 5 |
+
|
| 6 |
+
# LangChain / LangGraph
|
| 7 |
+
langchain==0.3.18
|
| 8 |
+
langchain-core==0.3.37
|
| 9 |
+
langchain-mistralai==0.2.4
|
| 10 |
+
langgraph==0.2.73
|
| 11 |
+
|
| 12 |
+
# Gradio UI
|
| 13 |
+
gradio==5.16.0
|
| 14 |
+
|
| 15 |
+
# Data handling
|
| 16 |
+
pandas==2.2.3
|
| 17 |
+
pyarrow==19.0.0
|
| 18 |
+
|
| 19 |
+
# Visualisation
|
| 20 |
+
plotly==5.24.1
|
| 21 |
+
|
| 22 |
+
# Mistral SDK (pulled by langchain-mistralai, pinned for stability)
|
| 23 |
+
mistralai==1.3.1
|
| 24 |
+
|
| 25 |
+
# Utilities
|
| 26 |
+
python-dotenv==1.0.1
|
tools.py
ADDED
|
@@ -0,0 +1,443 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
tools.py — 7 LangChain tool functions for BERTopic thematic analysis pipeline.
|
| 3 |
+
Constraints: ZERO if/else, ZERO for/while, ZERO try/except.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import re
|
| 10 |
+
import numpy as np
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import plotly.express as px
|
| 13 |
+
import plotly.graph_objects as go
|
| 14 |
+
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from langchain_core.tools import tool
|
| 17 |
+
from sentence_transformers import SentenceTransformer
|
| 18 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 19 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 20 |
+
from langchain_core.prompts import PromptTemplate
|
| 21 |
+
from langchain_core.output_parsers import JsonOutputParser
|
| 22 |
+
from langchain_mistralai import ChatMistralAI
|
| 23 |
+
from dotenv import load_dotenv
|
| 24 |
+
load_dotenv() # add this right after the imports
|
| 25 |
+
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
# Constants
|
| 28 |
+
# ---------------------------------------------------------------------------
|
| 29 |
+
|
| 30 |
+
# Regex fragments matching publisher boilerplate. _clean_text joins them with
# "|" and deletes every case-insensitive match.
BOILERPLATE_PATTERNS = [
    # Copyright sign followed by a year. Written as a regex escape so the
    # pattern cannot be corrupted by an encoding round-trip of this file.
    r"\u00a9\s*\d{4}",
    r"all rights reserved",
    r"published by elsevier",
    r"doi:\s*10\.\S+",
    r"this article is protected",
    r"www\.\S+\.com",
    # No MULTILINE flag is applied, so this only fires when the whole text is
    # the single word "abstract".
    r"^\s*abstract\s*$",
    r"please cite this article",
    r"accepted manuscript",
]
|
| 41 |
+
|
| 42 |
+
# Maps each accepted run_config name to the CSV column(s) whose text is
# analysed for that run.
RUN_CONFIGS = {
    "abstract": ["Abstract"],
    "title": ["Title"],
}

# Category names offered to the LLM as mapping targets in
# compare_with_taxonomy.
PAJAIS_CATEGORIES = [
    "Artificial Intelligence", "Machine Learning", "Deep Learning",
    "Natural Language Processing", "Computer Vision", "Robotics",
    "Knowledge Representation", "Expert Systems", "Decision Support",
    "Data Mining", "Information Retrieval", "Human-Computer Interaction",
    "Ethics in AI", "Explainable AI", "Fairness and Bias",
    "AI in Healthcare", "AI in Education", "AI in Finance",
    "AI in Manufacturing", "AI in Agriculture", "AI Governance",
    "Neural Networks", "Reinforcement Learning", "Federated Learning",
    "AI Safety",
]

# Chat-model client shared by every LLM-backed tool; built once at import time.
# NOTE(review): presumably needs the Mistral API key in the environment
# (load_dotenv() runs just above) - confirm.
_MISTRAL = ChatMistralAI(model="mistral-large-latest", temperature=0)
|
| 60 |
+
|
| 61 |
+
# ---------------------------------------------------------------------------
|
| 62 |
+
# Helper — pure functions, no loops
|
| 63 |
+
# ---------------------------------------------------------------------------
|
| 64 |
+
|
| 65 |
+
def _clean_text(text: str) -> str:
    """Remove publisher boilerplate from *text*.

    All fragments in BOILERPLATE_PATTERNS are OR-ed into one case-insensitive
    regex; every match is deleted and the result is trimmed of leading and
    trailing whitespace.
    """
    boilerplate = re.compile("|".join(BOILERPLATE_PATTERNS), re.IGNORECASE)
    without_noise = boilerplate.sub("", text)
    return without_noise.strip()
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _sentences_from_series(series: pd.Series) -> list[str]:
|
| 71 |
+
raw = series.dropna().str.cat(sep=" ")
|
| 72 |
+
return list(filter(None, map(str.strip, re.split(r"(?<=[.!?])\s+", raw))))
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _nearest_centroids(embeddings: np.ndarray, labels: np.ndarray, n: int = 5):
    """Map every cluster label to the labels of its most similar clusters.

    A cluster is represented by the mean of its members' embeddings, and
    similarity is the cosine similarity between those centroids.

    Args:
        embeddings: one embedding row per item.
        labels: cluster label of each row of ``embeddings``.
        n: maximum number of neighbours to report per cluster.

    Returns:
        dict of ``label -> list of neighbour labels`` ordered from most to
        least similar. A cluster is never listed as its own neighbour, so each
        list has at most ``number_of_clusters - 1`` entries.
    """
    unique_labels = np.unique(labels)
    centroids = np.array(
        [embeddings[labels == lbl].mean(axis=0) for lbl in unique_labels]
    )
    sim_matrix = cosine_similarity(centroids)
    # -1 is the minimum cosine similarity, so each cluster sorts last in its
    # own row.
    np.fill_diagonal(sim_matrix, -1)
    # With fewer than n + 1 clusters, taking n entries would reach that last
    # slot and report the cluster itself; cap at the number of other clusters.
    keep = max(0, min(n, len(unique_labels) - 1))
    ranked = np.argsort(sim_matrix, axis=1)[:, ::-1][:, :keep]
    neighbours = unique_labels[ranked].tolist()
    return dict(zip(unique_labels.tolist(), neighbours))
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _top_sentences(sentences: list[str], embeddings: np.ndarray,
                   centroid: np.ndarray, k: int = 5) -> list[str]:
    """Return the ``k`` sentences most cosine-similar to ``centroid``.

    ``sentences[i]`` is taken to correspond to ``embeddings[i]``; the result is
    ordered from most to least similar.
    """
    similarity = cosine_similarity([centroid], embeddings)[0]
    best_first = np.argsort(similarity)[::-1]
    return [sentences[idx] for idx in best_first[:k]]
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# ---------------------------------------------------------------------------
|
| 98 |
+
# Tool 1 — load_scopus_csv
|
| 99 |
+
# ---------------------------------------------------------------------------
|
| 100 |
+
|
| 101 |
+
@tool
def load_scopus_csv(csv_path: str, run_config: str = "abstract") -> str:
    """Load a Scopus CSV file, count papers/sentences, apply boilerplate regex
    filter, and return a JSON summary. run_config must be 'abstract' or 'title'.
    The cleaned table is saved next to the CSV as '<name>_cleaned.parquet' and
    that path is returned under 'parquet_path'."""
    df = pd.read_csv(csv_path)
    columns = RUN_CONFIGS[run_config]  # KeyError on an unknown run_config
    available_cols = [c for c in columns if c in df.columns]
    texts = df[available_cols].fillna("").apply(
        lambda row: " ".join(row.values.astype(str)), axis=1
    )

    # Pass 1 (_clean_text) removes the module-wide boilerplate patterns.
    # Pass 2 deletes the rest of the line from a copyright sign, the word
    # "copyright" or "palgrave" onwards ("." does not cross newlines). The
    # copyright sign is a regex escape so it survives encoding round-trips.
    trailing_noise = re.compile(
        r"\u00a9.*|all rights reserved|copyright.*|palgrave.*",
        re.IGNORECASE,
    )
    cleaned = [trailing_noise.sub("", _clean_text(text)) for text in texts]

    sentences = _sentences_from_series(pd.Series(cleaned))
    df["_cleaned_text"] = cleaned

    # Build the output name from the file stem rather than str.replace():
    # replace() left the path unchanged for e.g. "DATA.CSV", so the parquet
    # bytes were written over the source CSV, and it also rewrote ".csv"
    # occurring anywhere else in the path.
    source = Path(csv_path)
    parquet_path = source.with_name(f"{source.stem}_cleaned.parquet")
    df.to_parquet(parquet_path, index=False)

    summary = {
        "csv_path": csv_path,
        "run_config": run_config,
        "columns_used": available_cols,
        "total_papers": int(len(df)),
        "total_sentences": len(sentences),
        # dropna/astype keep NaN titles out of the JSON (NaN is not valid JSON).
        "sample_titles": (
            df["Title"].dropna().head(5).astype(str).tolist()
            if "Title" in df.columns
            else []
        ),
        # Lets the caller hand the file straight to run_bertopic_discovery.
        "parquet_path": str(parquet_path),
    }
    Path("summaries.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")
    return json.dumps(summary)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# ---------------------------------------------------------------------------
|
| 142 |
+
# Tool 2 — run_bertopic_discovery
|
| 143 |
+
# ---------------------------------------------------------------------------
|
| 144 |
+
|
| 145 |
+
@tool
def run_bertopic_discovery(parquet_path: str, run_config: str = "abstract") -> str:
    """Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering
    (cosine, threshold=0.7), find 5 nearest centroids per cluster, generate 4
    Plotly charts. Saves summaries.json + emb.npy + labels.npy. Returns topic
    summaries JSON."""
    df = pd.read_parquet(parquet_path)
    columns = RUN_CONFIGS[run_config]
    available_cols = [c for c in columns if c in df.columns]
    texts = df[available_cols].fillna("").apply(
        lambda row: " ".join(row.values.astype(str)), axis=1
    )
    # The text is rebuilt from the raw columns, so the boilerplate filter has to
    # be applied again here; otherwise copyright/publisher lines are embedded
    # and clustered like real content.
    sentences = _sentences_from_series(texts.map(_clean_text))

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)
    np.save("emb.npy", embeddings)

    clustering = AgglomerativeClustering(
        metric="cosine",
        linkage="average",
        distance_threshold=0.7,
        n_clusters=None,
    )
    labels = clustering.fit_predict(embeddings)
    # Persist the sentence -> cluster assignment alongside emb.npy so that later
    # steps can select a topic's rows without re-clustering.
    np.save("labels.npy", labels)

    unique_labels, counts = np.unique(labels, return_counts=True)
    nearest = _nearest_centroids(embeddings, labels)

    # Row indices and centroid of every cluster, computed once and reused by the
    # topic summaries and the heatmap.
    members = {int(lbl): np.flatnonzero(labels == lbl) for lbl in unique_labels}
    centroids = {lbl: embeddings[rows].mean(axis=0) for lbl, rows in members.items()}

    def _summarise(lbl: int, count: int) -> dict:
        rows = members[lbl]
        return {
            "topic_id": lbl,
            "sentence_count": int(count),
            "nearest_topics": nearest.get(lbl, []),
            # Rank only the cluster's own sentences; ranking the whole corpus
            # could return sentences that belong to a different topic.
            "top_sentences": _top_sentences(
                [sentences[i] for i in rows],
                embeddings[rows],
                centroids[lbl],
            ),
        }

    topic_summaries = list(map(_summarise, map(int, unique_labels), counts))

    # Sort by sentence count desc
    topic_summaries.sort(key=lambda t: t["sentence_count"], reverse=True)
    top100 = topic_summaries[:100]

    # ---- Chart 1: Bar chart - top 20 topics by sentence count ----
    top20 = top100[:20]
    fig1 = px.bar(
        x=[f"T{t['topic_id']}" for t in top20],
        y=[t["sentence_count"] for t in top20],
        labels={"x": "Topic", "y": "Sentences"},
        title="Top 20 Topics by Sentence Count",
    )

    # ---- Chart 2: Treemap ----
    fig2 = px.treemap(
        names=[f"Topic {t['topic_id']}" for t in top100],
        parents=["All"] * len(top100),
        values=[t["sentence_count"] for t in top100],
        title="Topic Distribution Treemap",
    )

    # ---- Chart 3: Scatter (PCA 2D projection) ----
    from sklearn.decomposition import PCA
    coords = PCA(n_components=2).fit_transform(embeddings)
    fig3 = go.Figure(go.Scatter(
        x=coords[:, 0], y=coords[:, 1],
        mode="markers",
        marker=dict(color=labels, colorscale="Viridis", size=4, opacity=0.6),
    ))
    fig3.update_layout(title="Sentence Clusters (PCA 2D)")

    # ---- Chart 4: Heatmap - top 10 topic cosine similarity ----
    top10_ids = [t["topic_id"] for t in top100[:10]]
    sim10 = cosine_similarity(np.array([centroids[i] for i in top10_ids]))
    fig4 = px.imshow(
        sim10,
        x=[f"T{i}" for i in top10_ids],
        y=[f"T{i}" for i in top10_ids],
        color_continuous_scale="Blues",
        title="Top-10 Topic Cosine Similarity Heatmap",
    )

    charts = {
        "bar_top20": fig1.to_json(),
        "treemap": fig2.to_json(),
        "scatter_pca": fig3.to_json(),
        "heatmap": fig4.to_json(),
    }
    total_clusters = int(len(unique_labels))

    # Start from an empty summary when summaries.json has not been written in
    # this working directory, instead of failing on the read.
    summaries_file = Path("summaries.json")
    existing = json.loads(summaries_file.read_text()) if summaries_file.exists() else {}
    existing.update({"bertopic": {"total_clusters": total_clusters}})
    summaries_file.write_text(json.dumps(existing, indent=2))
    Path("charts.json").write_text(json.dumps(charts, indent=2))
    Path("topics.json").write_text(json.dumps(top100, indent=2))

    return json.dumps({
        "total_clusters": total_clusters,
        "top100_count": len(top100),
        "charts_saved": list(charts.keys()),
    })
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# ---------------------------------------------------------------------------
|
| 260 |
+
# Tool 3 — label_topics_with_llm
|
| 261 |
+
# ---------------------------------------------------------------------------
|
| 262 |
+
|
| 263 |
+
@tool
def label_topics_with_llm(topics_json_path: str = "topics.json") -> str:
    """Send top-100 topics to Mistral via PromptTemplate + JsonOutputParser to
    generate human-readable labels. Returns labelled topics JSON."""
    topics = json.loads(Path(topics_json_path).read_text())
    batch = topics[:100]

    prompt = PromptTemplate.from_template(
        "You are a qualitative research expert. Below are topic clusters from a "
        "systematic literature review. For EACH topic assign a concise label "
        "(3-6 words) and one sentence of reasoning.\n\n"
        "Topics:\n{topics_text}\n\n"
        "Return ONLY valid JSON: a list of objects with keys: "
        "topic_id, label, reasoning. No markdown fences."
    )
    chain = prompt | _MISTRAL | JsonOutputParser()

    topics_text = "\n".join(
        f"Topic {t['topic_id']} ({t['sentence_count']} sentences): "
        + " | ".join(t["top_sentences"][:2])
        for t in batch
    )

    labelled = chain.invoke({"topics_text": topics_text})

    def _topic_key(value) -> str:
        # The model may echo an id as 12, "12" or "Topic 12"; compare on the
        # first run of digits so all three forms match the integer id.
        digits = re.search(r"\d+", str(value))
        return digits.group() if digits else str(value)

    # Entries that are not JSON objects cannot carry a label and are skipped.
    label_map = {
        _topic_key(item.get("topic_id")): item
        for item in labelled
        if isinstance(item, dict)
    }

    def _enrich(topic: dict) -> dict:
        found = label_map.get(_topic_key(topic["topic_id"]), {})
        # Copy only label/reasoning so the model's answer cannot overwrite
        # topic_id, sentence_count or the other computed fields.
        return {
            **topic,
            "label": found.get("label") or f"Topic {topic['topic_id']}",
            "reasoning": found.get("reasoning") or "",
        }

    enriched = list(map(_enrich, batch))

    Path("labelled_topics.json").write_text(json.dumps(enriched, indent=2))
    return json.dumps({"labelled_count": len(enriched), "path": "labelled_topics.json"})
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
# ---------------------------------------------------------------------------
|
| 300 |
+
# Tool 4 — consolidate_into_themes
|
| 301 |
+
# ---------------------------------------------------------------------------
|
| 302 |
+
|
| 303 |
+
def _sentence_cluster_labels(embeddings: np.ndarray) -> np.ndarray:
    """Return the cluster label of every row of ``embeddings``.

    A ``labels.npy`` file in the working directory is used when it has one
    entry per embedding row. Otherwise the assignment is recovered by running
    the same clustering configuration as run_bertopic_discovery.
    """
    labels_file = Path("labels.npy")
    cached = np.load(labels_file) if labels_file.exists() else np.empty(0, dtype=int)
    return (
        cached
        if len(cached) == len(embeddings)
        else AgglomerativeClustering(
            metric="cosine",
            linkage="average",
            distance_threshold=0.7,
            n_clusters=None,
        ).fit_predict(embeddings)
    )


@tool
def consolidate_into_themes(approved_groups_json: str) -> str:
    """Merge approved topic groups into themes, recompute centroids from emb.npy.
    approved_groups_json: JSON list of {theme_name, topic_ids: [...]} objects."""
    groups = json.loads(approved_groups_json)
    embeddings = np.load("emb.npy")
    topics = json.loads(Path("labelled_topics.json").read_text())
    topic_id_to_sentences = {int(t["topic_id"]): t["top_sentences"] for t in topics}
    labels = _sentence_cluster_labels(embeddings)

    def _build_theme(group: dict) -> dict:
        # Ids typed or generated as strings ("3") must match the integer ids.
        topic_ids = [int(tid) for tid in group["topic_ids"]]
        # Select the sentences assigned to the theme's topics. Topic ids are
        # cluster labels, not row numbers, so they cannot index emb.npy directly.
        member_rows = embeddings[np.isin(labels, topic_ids)]
        return {
            "theme_name": group["theme_name"],
            "topic_ids": topic_ids,
            "top_sentences": [
                sentence
                for tid in topic_ids
                for sentence in topic_id_to_sentences.get(tid, [])
            ][:10],
            # A theme with no member sentences gets an empty centroid rather
            # than a vector of NaN.
            "centroid": member_rows.mean(axis=0).tolist() if len(member_rows) else [],
        }

    themes = list(map(_build_theme, groups))

    Path("themes.json").write_text(json.dumps(themes, indent=2))
    return json.dumps({"themes_count": len(themes), "theme_names": [t["theme_name"] for t in themes]})
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
# ---------------------------------------------------------------------------
|
| 332 |
+
# Tool 5 — compare_with_taxonomy
|
| 333 |
+
# ---------------------------------------------------------------------------
|
| 334 |
+
|
| 335 |
+
@tool
def compare_with_taxonomy(themes_json_path: str = "themes.json") -> str:
    """Map consolidated themes to PAJAIS 25 categories via Mistral.
    Returns a mapping JSON."""
    themes = json.loads(Path(themes_json_path).read_text())

    prompt = PromptTemplate.from_template(
        "You are an AI research taxonomist. Map each theme to the most relevant "
        "PAJAIS category.\n\n"
        "PAJAIS Categories:\n{categories}\n\n"
        "Themes:\n{themes_text}\n\n"
        "Return ONLY valid JSON: a list of objects with keys: "
        "theme_name, pajais_category, confidence (0-1), rationale. No markdown."
    )
    chain = prompt | _MISTRAL | JsonOutputParser()

    # One line per theme: its name plus up to two representative sentences.
    themes_text = "\n".join(
        f"- {t['theme_name']}: " + "; ".join(t["top_sentences"][:2])
        for t in themes
    )
    # The bullet is written as an escape so the prompt cannot pick up
    # mis-encoded characters from this source file.
    categories = "\n".join(f" \u2022 {c}" for c in PAJAIS_CATEGORIES)

    mapping = chain.invoke({
        "categories": categories,
        "themes_text": themes_text,
    })

    Path("taxonomy_mapping.json").write_text(json.dumps(mapping, indent=2), encoding="utf-8")
    return json.dumps({"mapped_count": len(mapping), "path": "taxonomy_mapping.json"})
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
# ---------------------------------------------------------------------------
|
| 367 |
+
# Tool 6 — generate_comparison_csv
|
| 368 |
+
# ---------------------------------------------------------------------------
|
| 369 |
+
|
| 370 |
+
@tool
def generate_comparison_csv(original_csv_path: str) -> str:
    """Generate a side-by-side comparison CSV of abstract vs title clustering
    results for each paper. Returns path to output CSV."""
    papers = pd.read_csv(original_csv_path)
    # Title first, then Abstract; columns missing from the CSV are left out.
    text_cols = [name for name in ("Title", "Abstract") if name in papers.columns]

    comparison = papers[text_cols].copy()
    comparison.columns = [f"{name}_text" for name in text_cols]
    comparison.insert(0, "Paper_ID", range(1, len(papers) + 1))
    # Every row gets the same pointer text; no per-paper theme is computed here.
    comparison["Theme_Assignment"] = ["See themes.json for full mapping"] * len(comparison)

    out_path = "comparison_abstract_vs_title.csv"
    comparison.to_csv(out_path, index=False)
    return json.dumps({
        "output_csv": out_path,
        "rows": len(comparison),
        "columns": comparison.columns.tolist(),
    })
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
# ---------------------------------------------------------------------------
|
| 398 |
+
# Tool 7 — export_narrative
|
| 399 |
+
# ---------------------------------------------------------------------------
|
| 400 |
+
|
| 401 |
+
@tool
def export_narrative(context_json: str = "{}") -> str:
    """Generate a ~500-word Section 7 narrative via Mistral, synthesising all
    prior analysis. context_json may contain extra instructions. Returns the
    narrative text and saves it to narrative.md."""

    def _load_json(name: str, default):
        # Earlier pipeline outputs are optional; fall back when a file is absent.
        path = Path(name)
        return json.loads(path.read_text()) if path.exists() else default

    # Accept an empty string, and any JSON value that is not an object, without
    # failing: a non-object value is passed to the prompt as the extra text.
    context = json.loads(context_json or "{}")
    extra = (
        context.get("extra_instructions", "None")
        if isinstance(context, dict)
        else str(context)
    )
    themes = _load_json("themes.json", [])
    mapping = _load_json("taxonomy_mapping.json", [])
    summaries = _load_json("summaries.json", {})

    themes_summary = "\n".join(
        f"- **{t['theme_name']}**: " + "; ".join(t["top_sentences"][:1])
        for t in themes
    )
    # The arrow is an escape so it cannot be mis-encoded in this source file.
    mapping_summary = "\n".join(
        f"- {m.get('theme_name','?')} \u2192 {m.get('pajais_category','?')} "
        f"(confidence: {m.get('confidence', '?')})"
        for m in mapping
    )

    prompt = PromptTemplate.from_template(
        "You are a senior academic researcher writing a systematic literature review. "
        "Write Section 7 (Discussion & Synthesis) of approximately 500 words. "
        "Use an academic tone, Braun & Clarke (2006) thematic analysis framing, "
        "and reference the themes and PAJAIS taxonomy mappings provided.\n\n"
        "Dataset summary:\n{summaries}\n\n"
        "Themes identified:\n{themes}\n\n"
        "PAJAIS taxonomy mapping:\n{mapping}\n\n"
        "Extra context: {extra}\n\n"
        "Write the section now. Use markdown headings."
    )
    chain = prompt | _MISTRAL

    result = chain.invoke({
        "summaries": json.dumps(summaries, indent=2),
        "themes": themes_summary,
        "mapping": mapping_summary,
        "extra": extra,
    })

    narrative = result.content
    # Model output routinely contains non-ASCII punctuation; write UTF-8
    # explicitly instead of relying on the platform default encoding.
    Path("narrative.md").write_text(narrative, encoding="utf-8")
    return json.dumps({"narrative_path": "narrative.md", "word_count": len(narrative.split())})
|