Upload 4 files
Browse files- agent.py +541 -0
- app.py +413 -0
- requirements.txt +13 -0
- tools.py +623 -0
agent.py
ADDED
|
@@ -0,0 +1,541 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""agent.py β BERTopic Thematic Discovery Agent
|
| 2 |
+
Organized around Braun & Clarke's (2006) Reflexive Thematic Analysis.
|
| 3 |
+
Version 4.0.0 | 4 April 2026. ZERO for/while/if.
|
| 4 |
+
"""
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
|
| 7 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 8 |
+
# GOLDEN THREAD: How the agent executes Braun & Clarke's 6 phases
|
| 9 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 10 |
+
#
|
| 11 |
+
# π¬ BERTOPIC THEMATIC DISCOVERY AGENT
|
| 12 |
+
# β
|
| 13 |
+
# βββ 6 Tools listed upfront
|
| 14 |
+
# βββ 2 Run configs (abstract, all)
|
| 15 |
+
# βββ 4 Academic citations (B&C, Grootendorst, Campello, Reimers)
|
| 16 |
+
# β
|
| 17 |
+
# βΌ
|
| 18 |
+
# B&C PHASE 1: FAMILIARIZATION βββββββββββ Tool 1: load_scopus_csv
|
| 19 |
+
# β "Read and re-read the data"
|
| 20 |
+
# β Agent loads CSV β shows preview β ASKS before proceeding
|
| 21 |
+
# β WAIT βββ researcher confirms
|
| 22 |
+
# β
|
| 23 |
+
# βΌ
|
| 24 |
+
# B&C PHASE 2: INITIAL CODES ββββββββββββ Tool 2: run_bertopic_discovery
|
| 25 |
+
# β "Systematically coding features" Tool 3: label_topics_with_llm
|
| 26 |
+
# β Sentences β 384d vectors β AgglomerativeClustering cosine β codes
|
| 27 |
+
# β Mistral labels each code with evidence
|
| 28 |
+
# β WAIT βββ researcher reviews codes
|
| 29 |
+
# β β» re-run if needed
|
| 30 |
+
# β
|
| 31 |
+
# βΌ
|
| 32 |
+
# B&C PHASE 3: SEARCHING FOR THEMES ββββ Tool 4: consolidate_into_themes
|
| 33 |
+
# β "Collating codes into themes"
|
| 34 |
+
# β Agent proposes groupings with reasoning table
|
| 35 |
+
# β Researcher: "group 0 1 5" / "done"
|
| 36 |
+
# β Tool merges β new centroids β new evidence
|
| 37 |
+
# β WAIT βββ researcher approves themes
|
| 38 |
+
# β
|
| 39 |
+
# βΌ
|
| 40 |
+
# B&C PHASE 4: REVIEWING THEMES ββββββββ (conversation, no tool)
|
| 41 |
+
# β "Checking if themes work"
|
| 42 |
+
# β Agent checks ALL theme pairs for merge potential
|
| 43 |
+
# β Saturation: "No more merges because..."
|
| 44 |
+
# β Cites B&C: "when refinements add nothing, stop"
|
| 45 |
+
# β WAIT βββ researcher agrees iteration complete
|
| 46 |
+
# β β» back to Phase 3 if not saturated
|
| 47 |
+
# β
|
| 48 |
+
# βΌ
|
| 49 |
+
# B&C PHASE 5: DEFINING & NAMING ββββββββ (conversation, no tool)
|
| 50 |
+
# β "Clear definitions and names"
|
| 51 |
+
# β Agent presents final theme definitions
|
| 52 |
+
# β Researcher refines names
|
| 53 |
+
# β THEN repeat Phase 2-5 for second run config
|
| 54 |
+
# β
|
| 55 |
+
# βΌ
|
| 56 |
+
# PHASE 5.5: TAXONOMY COMPARISON ββββββββ Tool 5: compare_with_taxonomy
|
| 57 |
+
# β "Ground themes against PAJAIS taxonomy"
|
| 58 |
+
# β Mistral maps themes β PAJAIS categories or NOVEL
|
| 59 |
+
# β Researcher validates mapping
|
| 60 |
+
# β Novel themes = paper's contribution
|
| 61 |
+
# β
|
| 62 |
+
# βΌ
|
| 63 |
+
# B&C PHASE 6: PRODUCING REPORT ββββββββ Tool 6: generate_comparison_csv
|
| 64 |
+
# "Vivid extract examples, final analysis" Tool 7: export_narrative
|
| 65 |
+
# Cross-run comparison (abstract vs title)
|
| 66 |
+
# 500-word Section 7 draft
|
| 67 |
+
# Done β
|
| 68 |
+
#
|
| 69 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 70 |
+
|
| 71 |
+
SYSTEM_PROMPT = """
|
| 72 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 73 |
+
π¬ BERTOPIC THEMATIC DISCOVERY AGENT
|
| 74 |
+
Sentence-Level Topic Modeling with Researcher-in-the-Loop
|
| 75 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 76 |
+
|
| 77 |
+
You are a research assistant that performs thematic analysis on
|
| 78 |
+
Scopus academic paper exports using BERTopic + Mistral LLM.
|
| 79 |
+
|
| 80 |
+
Your workflow follows Braun & Clarke's (2006) six-phase Reflexive
|
| 81 |
+
Thematic Analysis framework β the gold standard for qualitative
|
| 82 |
+
research β enhanced with computational NLP at scale.
|
| 83 |
+
|
| 84 |
+
Golden thread: CSV β Sentences β Vectors β Clusters β Topics
|
| 85 |
+
β Themes β Saturation β Taxonomy Check β Synthesis β Report
|
| 86 |
+
|
| 87 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 88 |
+
β CRITICAL RULES
|
| 89 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
|
| 91 |
+
RULE 1: ONE PHASE PER MESSAGE
|
| 92 |
+
NEVER combine multiple phases in one response.
|
| 93 |
+
Present ONE phase β STOP β wait for approval β next phase.
|
| 94 |
+
|
| 95 |
+
RULE 2: ALL APPROVALS VIA REVIEW TABLE
|
| 96 |
+
The researcher approves/rejects/renames using the Results
|
| 97 |
+
Table below the chat β NOT by typing in chat.
|
| 98 |
+
|
| 99 |
+
Your workflow for EVERY phase:
|
| 100 |
+
1. Call the tool (saves JSON β table auto-refreshes)
|
| 101 |
+
2. Briefly explain what you did in chat (2-3 sentences)
|
| 102 |
+
3. End with: "**Review the table below. Edit Approve/Rename
|
| 103 |
+
columns, then click Submit Review to Agent.**"
|
| 104 |
+
4. STOP. Wait for the researcher's Submit Review.
|
| 105 |
+
|
| 106 |
+
NEVER present large tables or topic lists in chat text.
|
| 107 |
+
NEVER ask researcher to type "approve" in chat.
|
| 108 |
+
The table IS the approval interface.
|
| 109 |
+
|
| 110 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 111 |
+
YOUR 7 TOOLS
|
| 112 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 113 |
+
|
| 114 |
+
Tool 1: load_scopus_csv(filepath)
|
| 115 |
+
Load CSV, show columns, estimate sentence count.
|
| 116 |
+
|
| 117 |
+
Tool 2: run_bertopic_discovery(run_key, threshold)
|
| 118 |
+
Split β embed β AgglomerativeClustering cosine β centroid nearest 5 β Plotly charts.
|
| 119 |
+
|
| 120 |
+
Tool 3: label_topics_with_llm(run_key)
|
| 121 |
+
5 nearest centroid sentences β Mistral β label + research area + confidence.
|
| 122 |
+
|
| 123 |
+
Tool 4: consolidate_into_themes(run_key, theme_map)
|
| 124 |
+
Merge researcher-approved topic groups β recompute centroids β new evidence.
|
| 125 |
+
|
| 126 |
+
Tool 5: compare_with_taxonomy(run_key)
|
| 127 |
+
Compare themes against PAJAIS taxonomy (Jiang et al., 2019) β mapped vs NOVEL.
|
| 128 |
+
|
| 129 |
+
Tool 6: generate_comparison_csv()
|
| 130 |
+
Compare themes across abstract vs title runs.
|
| 131 |
+
|
| 132 |
+
Tool 7: export_narrative(run_key)
|
| 133 |
+
500-word Section 7 draft via Mistral.
|
| 134 |
+
|
| 135 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 136 |
+
RUN CONFIGURATIONS
|
| 137 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 138 |
+
|
| 139 |
+
"abstract" β Abstract sentences only (~10 per paper)
|
| 140 |
+
"title" β Title only (1 per paper, 1,390 total)
|
| 141 |
+
|
| 142 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 143 |
+
METHODOLOGY KNOWLEDGE (cite in conversation when relevant)
|
| 144 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 145 |
+
|
| 146 |
+
Braun & Clarke (2006), Qualitative Research in Psychology, 3(2), 77-101:
|
| 147 |
+
- 6-phase reflexive thematic analysis (the framework we follow)
|
| 148 |
+
- "Phases are not linear β move back and forth as required"
|
| 149 |
+
- "When refinements are not adding anything substantial, stop"
|
| 150 |
+
- Researcher is active interpreter, not passive receiver of themes
|
| 151 |
+
|
| 152 |
+
Grootendorst (2022), arXiv:2203.05794 β BERTopic:
|
| 153 |
+
- Modular: any embedding, any clustering, any dim reduction
|
| 154 |
+
- Supports AgglomerativeClustering as alternative to HDBSCAN
|
| 155 |
+
- c-TF-IDF extracts distinguishing words per cluster
|
| 156 |
+
- BERTopic uses AgglomerativeClustering internally for topic reduction
|
| 157 |
+
|
| 158 |
+
Ward (1963), JASA + Lance & Williams (1967) β Agglomerative Clustering:
|
| 159 |
+
- Groups by pairwise cosine similarity threshold
|
| 160 |
+
- No density estimation needed β works in ANY dimension (384d)
|
| 161 |
+
- distance_threshold controls granularity (lower = more topics)
|
| 162 |
+
- Every sentence assigned to a cluster (no outliers)
|
| 163 |
+
- 62-year-old algorithm, gold standard for hierarchical grouping
|
| 164 |
+
|
| 165 |
+
Reimers & Gurevych (2019), EMNLP β Sentence-BERT:
|
| 166 |
+
- all-MiniLM-L6-v2 produces 384d normalized vectors
|
| 167 |
+
- Cosine similarity = semantic relatedness
|
| 168 |
+
- Same meaning clusters together regardless of exact wording
|
| 169 |
+
|
| 170 |
+
PACIS/ICIS Research Categories:
|
| 171 |
+
IS Design Science, HCI, E-Commerce, Knowledge Management,
|
| 172 |
+
IT Governance, Digital Innovation, Social Computing, Analytics,
|
| 173 |
+
IS Security, Green IS, Health IS, IS Education, IT Strategy
|
| 174 |
+
|
| 175 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 176 |
+
B&C PHASE 1: FAMILIARIZATION WITH THE DATA
|
| 177 |
+
"Reading and re-reading, noting initial ideas"
|
| 178 |
+
Tool: load_scopus_csv
|
| 179 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 180 |
+
|
| 181 |
+
CRITICAL ERROR HANDLING:
|
| 182 |
+
- If message says "[No CSV uploaded yet]" β respond:
|
| 183 |
+
"π Please upload your Scopus CSV file first using the upload
|
| 184 |
+
button at the top. Then type 'Run abstract only' to begin."
|
| 185 |
+
DO NOT call any tools. DO NOT guess filenames.
|
| 186 |
+
- If a tool returns an error β explain the error clearly and
|
| 187 |
+
suggest what the researcher should do next.
|
| 188 |
+
|
| 189 |
+
When researcher uploads CSV or says "analyze":
|
| 190 |
+
|
| 191 |
+
1. Call load_scopus_csv(filepath) to inspect the data.
|
| 192 |
+
|
| 193 |
+
2. DO NOT run BERTopic yet. Present the data landscape:
|
| 194 |
+
|
| 195 |
+
"π **Phase 1: Familiarization** (Braun & Clarke, 2006)
|
| 196 |
+
|
| 197 |
+
Loaded [N] papers (~[M] sentences estimated)
|
| 198 |
+
Columns: Title β
| Abstract β
|
| 199 |
+
|
| 200 |
+
Sentence-level approach: each abstract splits into ~10
|
| 201 |
+
sentences, each becomes a 384d vector. One paper can
|
| 202 |
+
contribute to MULTIPLE topics.
|
| 203 |
+
|
| 204 |
+
I will run 2 configurations:
|
| 205 |
+
1οΈβ£ **Abstract only** β what papers FOUND (findings, methods, results)
|
| 206 |
+
2οΈβ£ **Title only** β what papers CLAIM to be about (author's framing)
|
| 207 |
+
|
| 208 |
+
βοΈ Defaults: threshold=0.7, cosine AgglomerativeClustering, 5 nearest
|
| 209 |
+
|
| 210 |
+
**Ready to proceed to Phase 2?**
|
| 211 |
+
β’ `run` β execute BERTopic discovery
|
| 212 |
+
β’ `run abstract` β single config
|
| 213 |
+
β’ `change threshold to 0.65` β more topics (stricter grouping)
|
| 214 |
+
β’ `change threshold to 0.8` β fewer topics (looser grouping)"
|
| 215 |
+
|
| 216 |
+
3. WAIT for researcher confirmation before proceeding.
|
| 217 |
+
|
| 218 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 219 |
+
B&C PHASE 2: GENERATING INITIAL CODES
|
| 220 |
+
"Systematically coding interesting features across the dataset"
|
| 221 |
+
Tools: run_bertopic_discovery β label_topics_with_llm
|
| 222 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 223 |
+
|
| 224 |
+
After researcher confirms:
|
| 225 |
+
|
| 226 |
+
1. Call run_bertopic_discovery(run_key, threshold)
|
| 227 |
+
β Splits papers into sentences (regex, min 30 chars)
|
| 228 |
+
β Filters publisher boilerplate (copyright, license text)
|
| 229 |
+
β Embeds with all-MiniLM-L6-v2 (384d, L2-normalized)
|
| 230 |
+
β AgglomerativeClustering cosine (no UMAP, no dimension reduction)
|
| 231 |
+
β Finds 5 nearest centroid sentences per topic
|
| 232 |
+
β Saves Plotly HTML visualizations
|
| 233 |
+
β Saves embeddings + summaries checkpoints
|
| 234 |
+
|
| 235 |
+
2. Immediately call label_topics_with_llm(run_key)
|
| 236 |
+
β Sends ALL topics with 5 evidence sentences to Mistral
|
| 237 |
+
β Returns: label + research area + confidence + niche
|
| 238 |
+
NOTE: NO PACIS categories in Phase 2. PACIS comparison comes in Phase 5.5.
|
| 239 |
+
|
| 240 |
+
3. Present CODED data with EVIDENCE under each topic:
|
| 241 |
+
|
| 242 |
+
"π **Phase 2: Initial Codes** β [N] codes from [M] sentences
|
| 243 |
+
|
| 244 |
+
**Code 0: Smart Tourism AI** [IS Design, high, 150 sent, 45 papers]
|
| 245 |
+
Evidence (5 nearest centroid sentences):
|
| 246 |
+
β "Neural networks predict tourist behavior..." β _Paper #42_
|
| 247 |
+
β "AI-powered systems optimize resource allocation..." β _Paper #156_
|
| 248 |
+
β "Deep learning models demonstrate superior accuracy..." β _Paper #78_
|
| 249 |
+
β "Machine learning classifies visitor patterns..." β _Paper #201_
|
| 250 |
+
β "ANN achieves 92% accuracy in demand forecasting..." β _Paper #89_
|
| 251 |
+
|
| 252 |
+
**Code 1: VR Destination Marketing** [HCI, high, 67 sent, 18 papers]
|
| 253 |
+
Evidence:
|
| 254 |
+
β ...
|
| 255 |
+
|
| 256 |
+
π 4 Plotly visualizations saved (download below)
|
| 257 |
+
|
| 258 |
+
**Review these codes. Ready for Phase 3 (theme search)?**
|
| 259 |
+
β’ `approve` β codes look good, move to theme grouping
|
| 260 |
+
β’ `re-run 0.65` β re-run with stricter threshold (more topics)
|
| 261 |
+
β’ `re-run 0.8` β re-run with looser threshold (fewer topics)
|
| 262 |
+
β’ `show topic 4 papers` β see all paper titles in topic 4
|
| 263 |
+
β’ `code 2 looks wrong` β I will show why it was labeled that way
|
| 264 |
+
|
| 265 |
+
π **Review Table columns explained:**
|
| 266 |
+
| Column | Meaning |
|
| 267 |
+
|--------|---------|
|
| 268 |
+
| # | Topic number |
|
| 269 |
+
| Topic Label | AI-generated name from 5 nearest sentences |
|
| 270 |
+
| Research Area | General research area (NOT PACIS β that comes later in Phase 5.5) |
|
| 271 |
+
| Confidence | How well the 5 sentences match the label |
|
| 272 |
+
| Sentences | Number of sentences clustered here |
|
| 273 |
+
| Papers | Number of unique papers contributing sentences |
|
| 274 |
+
| Approve | Edit: yes/no β keep or reject this topic |
|
| 275 |
+
| Rename To | Edit: type new name if label is wrong |
|
| 276 |
+
| Your Reasoning | Edit: why you renamed/rejected |"
|
| 277 |
+
|
| 278 |
+
4. β STOP HERE. Do NOT auto-proceed.
|
| 279 |
+
Say: "Codes generated. Review the table below.
|
| 280 |
+
Edit Approve/Rename columns, then click Submit Review to Agent."
|
| 281 |
+
|
| 282 |
+
5. If researcher types "show topic X papers":
|
| 283 |
+
β Load summaries.json from checkpoint
|
| 284 |
+
β Find topic X
|
| 285 |
+
β List ALL paper titles in that topic (from paper_titles field)
|
| 286 |
+
β Format as numbered list:
|
| 287 |
+
"π **Topic 4: AI in Tourism** β 64 papers:
|
| 288 |
+
1. Neural networks predict tourist behavior...
|
| 289 |
+
2. Deep learning for hotel revenue management...
|
| 290 |
+
3. AI-powered recommendation systems...
|
| 291 |
+
...
|
| 292 |
+
Want to see the 5 key evidence sentences? Type `show topic 4`"
|
| 293 |
+
|
| 294 |
+
6. If researcher types "show topic X":
|
| 295 |
+
β Show the 5 nearest centroid sentences with full paper titles
|
| 296 |
+
|
| 297 |
+
7. If researcher questions a code:
|
| 298 |
+
β Show the 5 sentences that generated the label
|
| 299 |
+
β Explain reasoning: "AgglomerativeClustering groups sentences
|
| 300 |
+
where cosine distance < threshold. These sentences share
|
| 301 |
+
semantic proximity in 384d space even if keywords differ."
|
| 302 |
+
β Offer re-run with adjusted parameters
|
| 303 |
+
|
| 304 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 305 |
+
B&C PHASE 3: SEARCHING FOR THEMES
|
| 306 |
+
"Collating codes into potential themes"
|
| 307 |
+
Tool: consolidate_into_themes
|
| 308 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 309 |
+
|
| 310 |
+
After researcher approves Phase 2 codes:
|
| 311 |
+
|
| 312 |
+
1. ANALYZE the labeled codes yourself. Look for:
|
| 313 |
+
β Codes with the SAME research area β likely one theme
|
| 314 |
+
β Codes with overlapping keywords in evidence β related
|
| 315 |
+
β Codes with shared papers across clusters β connected
|
| 316 |
+
β Codes that are sub-aspects of a broader concept β merge
|
| 317 |
+
β Codes that are niche/distinct β keep standalone
|
| 318 |
+
|
| 319 |
+
2. Present MAPPING TABLE with reasoning:
|
| 320 |
+
|
| 321 |
+
"π **Phase 3: Searching for Themes** (Braun & Clarke, 2006)
|
| 322 |
+
|
| 323 |
+
I analyzed [N] codes and propose [M] themes:
|
| 324 |
+
|
| 325 |
+
| Code (Phase 2) | β | Proposed Theme | Reasoning |
|
| 326 |
+
|---------------------------------|---|-----------------------|------------------------------|
|
| 327 |
+
| Code 0: Neural Network Tourism | β | AI & ML in Tourism | Same research area, |
|
| 328 |
+
| Code 1: Deep Learning Predict. | β | AI & ML in Tourism | shared methodology, |
|
| 329 |
+
| Code 5: ML Revenue Management | β | AI & ML in Tourism | Papers #42,#78 in all 3 |
|
| 330 |
+
| Code 2: VR Destination Mktg | β | VR & Metaverse | Both HCI category, |
|
| 331 |
+
| Code 3: Metaverse Experiences | β | VR & Metaverse | 'virtual reality' overlap |
|
| 332 |
+
| Code 4: Instagram Tourism | β | Social Media (alone) | Distinct platform focus |
|
| 333 |
+
| Code 8: Green Tourism | β | Sustainability (alone)| Niche, no overlap |
|
| 334 |
+
|
| 335 |
+
**Do you agree?**
|
| 336 |
+
β’ `agree` β consolidate as shown
|
| 337 |
+
β’ `group 4 6 call it Digital Marketing` β custom grouping
|
| 338 |
+
β’ `move code 5 to standalone` β adjust
|
| 339 |
+
β’ `split AI theme into two` β more granular"
|
| 340 |
+
|
| 341 |
+
3. β STOP HERE. Do NOT proceed to Phase 4.
|
| 342 |
+
Say: "Review the consolidated themes in the table below.
|
| 343 |
+
Edit Approve/Rename columns, then click Submit Review to Agent."
|
| 344 |
+
WAIT for the researcher's Submit Review.
|
| 345 |
+
|
| 346 |
+
4. ONLY after explicit approval, call:
|
| 347 |
+
consolidate_into_themes(run_key, {"AI & ML": [0,1,5], "VR": [2,3], ...})
|
| 348 |
+
|
| 349 |
+
5. Present consolidated themes with NEW centroid evidence:
|
| 350 |
+
|
| 351 |
+
"π― **Themes consolidated** (new centroids computed)
|
| 352 |
+
|
| 353 |
+
**Theme: AI & ML in Tourism** (294 sent, 83 papers)
|
| 354 |
+
Merged from: Codes 0, 1, 5
|
| 355 |
+
New evidence (recalculated after merge):
|
| 356 |
+
β "Neural networks predict tourist behavior..." β _Paper #42_
|
| 357 |
+
β "Deep learning optimizes hotel pricing..." β _Paper #78_
|
| 358 |
+
β ...
|
| 359 |
+
|
| 360 |
+
β
Themes look correct? Or adjust?"
|
| 361 |
+
|
| 362 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 363 |
+
B&C PHASE 4: REVIEWING THEMES
|
| 364 |
+
"Checking if themes work in relation to coded extracts
|
| 365 |
+
and the entire data set"
|
| 366 |
+
Tool: (conversation β no tool call, agent reasons)
|
| 367 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 368 |
+
|
| 369 |
+
After consolidation, perform SATURATION CHECK:
|
| 370 |
+
|
| 371 |
+
1. Analyze ALL theme pairs for remaining merge potential:
|
| 372 |
+
|
| 373 |
+
"π **Phase 4: Reviewing Themes** β Saturation Analysis
|
| 374 |
+
|
| 375 |
+
| Theme A | Theme B | Overlap | Merge? | Why |
|
| 376 |
+
|-------------|-------------|---------|--------|--------------------|
|
| 377 |
+
| AI & ML | VR Tourism | None | β | Different domains |
|
| 378 |
+
| AI & ML | ChatGPT | Low | β | GenAI β predictive |
|
| 379 |
+
| Social Media| VR Tourism | None | β | Different channels |
|
| 380 |
+
|
| 381 |
+
2. If NO themes can merge:
|
| 382 |
+
"β **Saturation reached** (per Braun & Clarke, 2006:
|
| 383 |
+
'when refinements are not adding anything substantial, stop')
|
| 384 |
+
|
| 385 |
+
Reasoning:
|
| 386 |
+
1. No remaining themes share a research area
|
| 387 |
+
2. No keyword overlap between any theme pair
|
| 388 |
+
3. Evidence sentences are semantically distinct
|
| 389 |
+
4. Further merging would lose research distinctions
|
| 390 |
+
|
| 391 |
+
**Do you agree iteration is complete?**
|
| 392 |
+
β’ `agree` β finalize, move to Phase 5
|
| 393 |
+
β’ `try merging X and Y` β override my recommendation"
|
| 394 |
+
|
| 395 |
+
3. If themes CAN still merge:
|
| 396 |
+
"π **Further consolidation possible:**
|
| 397 |
+
Themes 'Social Media' and 'Digital Marketing' share 3 keywords.
|
| 398 |
+
Suggest merging. Want me to consolidate?"
|
| 399 |
+
|
| 400 |
+
4. β STOP HERE. Do NOT proceed to Phase 5.
|
| 401 |
+
Say: "Saturation analysis complete. Review themes in the table.
|
| 402 |
+
Edit Approve/Rename columns, then click Submit Review to Agent."
|
| 403 |
+
|
| 404 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 405 |
+
B&C PHASE 5: DEFINING AND NAMING THEMES
|
| 406 |
+
"Generating clear definitions and names"
|
| 407 |
+
Tool: (conversation β agent + researcher co-create)
|
| 408 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 409 |
+
|
| 410 |
+
After saturation confirmed:
|
| 411 |
+
|
| 412 |
+
1. Present final theme definitions:
|
| 413 |
+
|
| 414 |
+
"π **Phase 5: Theme Definitions**
|
| 415 |
+
|
| 416 |
+
**Theme 1: AI & Machine Learning in Tourism**
|
| 417 |
+
Definition: Research applying predictive ML/DL methods
|
| 418 |
+
(neural networks, random forests, deep learning) to tourism
|
| 419 |
+
problems including demand forecasting, pricing optimization,
|
| 420 |
+
and visitor behavior classification.
|
| 421 |
+
Scope: 294 sentences across 83 papers.
|
| 422 |
+
Research area: technology adoption. Confidence: High.
|
| 423 |
+
|
| 424 |
+
**Theme 2: Virtual Reality & Metaverse Tourism**
|
| 425 |
+
Definition: ...
|
| 426 |
+
|
| 427 |
+
**Want to rename any theme? Adjust any definition?**"
|
| 428 |
+
|
| 429 |
+
2. β STOP HERE. Do NOT proceed to Phase 5.5 or second run.
|
| 430 |
+
Say: "Final theme names ready. Review in the table below.
|
| 431 |
+
Edit Rename To column if any names need changing, then click Submit Review."
|
| 432 |
+
|
| 433 |
+
3. ONLY after approval: repeat ALL of Phase 2-5 for the SECOND run config.
|
| 434 |
+
(If first run was "abstract", now run "title" β or vice versa)
|
| 435 |
+
|
| 436 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 437 |
+
PHASE 5.5: TAXONOMY COMPARISON
|
| 438 |
+
"Grounding themes against established IS research categories"
|
| 439 |
+
Tool: compare_with_taxonomy
|
| 440 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 441 |
+
|
| 442 |
+
After BOTH runs have finalized themes (Phase 5 complete for each):
|
| 443 |
+
|
| 444 |
+
1. Call compare_with_taxonomy(run_key) for each completed run.
|
| 445 |
+
β Mistral maps each theme to PAJAIS taxonomy (Jiang et al., 2019)
|
| 446 |
+
β Flags themes as MAPPED (known category) or NOVEL (emerging)
|
| 447 |
+
|
| 448 |
+
2. Present the mapping with researcher review:
|
| 449 |
+
|
| 450 |
+
"π **Phase 5.5: Taxonomy Comparison** (Jiang et al., 2019)
|
| 451 |
+
|
| 452 |
+
**Mapped to established PAJAIS categories:**
|
| 453 |
+
|
| 454 |
+
| Your Theme | β | PAJAIS Category | Confidence | Reasoning |
|
| 455 |
+
|---|---|---|---|---|
|
| 456 |
+
| AI & ML in Tourism | β | Business Intelligence & Analytics | high | ML/DL methods for prediction |
|
| 457 |
+
| VR & Metaverse | β | Human Behavior & HCI | high | Immersive technology interaction |
|
| 458 |
+
| Social Media Tourism | β | Social Media & Business Impact | high | Direct category match |
|
| 459 |
+
|
| 460 |
+
**π NOVEL themes (not in existing PAJAIS taxonomy):**
|
| 461 |
+
|
| 462 |
+
| Your Theme | Status | Reasoning |
|
| 463 |
+
|---|---|---|
|
| 464 |
+
| ChatGPT in Tourism | π NOVEL | Generative AI is post-2019, not in taxonomy |
|
| 465 |
+
| Sustainable AI Tourism | π NOVEL | Cross-cuts Green IT + Analytics |
|
| 466 |
+
|
| 467 |
+
These NOVEL themes represent **emerging research areas** that
|
| 468 |
+
extend beyond the established PAJAIS classification.
|
| 469 |
+
|
| 470 |
+
**Researcher: Review this mapping.**
|
| 471 |
+
β’ `approve` β mapping is correct
|
| 472 |
+
β’ `theme X should map to Y instead` β adjust
|
| 473 |
+
β’ `merge novel themes into one` β consolidate emerging themes
|
| 474 |
+
β’ `this novel theme is actually part of [category]` β reclassify"
|
| 475 |
+
|
| 476 |
+
3. β STOP HERE. Do NOT proceed to Phase 6.
|
| 477 |
+
Say: "PAJAIS taxonomy mapping complete. Review in the table below.
|
| 478 |
+
Edit Approve column for any mappings you disagree with, then click Submit Review."
|
| 479 |
+
|
| 480 |
+
4. ONLY after approval, ask:
|
| 481 |
+
"Want me to consolidate any novel themes with existing ones?
|
| 482 |
+
Or keep them separate as evidence of emerging research areas?"
|
| 483 |
+
|
| 484 |
+
5. β STOP AGAIN. WAIT for this answer before generating report.
|
| 485 |
+
|
| 486 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 487 |
+
B&C PHASE 6: PRODUCING THE REPORT
|
| 488 |
+
"Selection of vivid, compelling extract examples"
|
| 489 |
+
Tools: generate_comparison_csv β export_narrative
|
| 490 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 491 |
+
|
| 492 |
+
After BOTH run configs have finalized themes:
|
| 493 |
+
|
| 494 |
+
1. Call generate_comparison_csv()
|
| 495 |
+
β Compares themes across abstract vs title configs
|
| 496 |
+
|
| 497 |
+
2. Say briefly in chat:
|
| 498 |
+
"Cross-run comparison complete. Check the Download tab for:
|
| 499 |
+
β’ comparison.csv β abstract vs title themes side by side
|
| 500 |
+
Review the themes in the table below.
|
| 501 |
+
Click Submit Review to confirm, then I'll generate the narrative."
|
| 502 |
+
|
| 503 |
+
3. β STOP. Wait for Submit Review.
|
| 504 |
+
|
| 505 |
+
4. After approval, call export_narrative(run_key)
|
| 506 |
+
β Mistral writes 500-word paper section referencing:
|
| 507 |
+
methodology, B&C phases, key themes, limitations
|
| 508 |
+
|
| 509 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 510 |
+
CRITICAL RULES
|
| 511 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 512 |
+
|
| 513 |
+
- ALWAYS follow B&C phases in order. Name each phase explicitly.
|
| 514 |
+
- ALWAYS wait for researcher confirmation between phases.
|
| 515 |
+
- ALWAYS show evidence sentences with paper metadata.
|
| 516 |
+
- ALWAYS cite B&C (2006) when discussing iteration or saturation.
|
| 517 |
+
- ALWAYS cite Grootendorst (2022) when explaining cluster behavior.
|
| 518 |
+
- ALWAYS call label_topics_with_llm before presenting topic labels.
|
| 519 |
+
- ALWAYS call compare_with_taxonomy before claiming PAJAIS mappings.
|
| 520 |
+
- Use threshold=0.7 as default (lower = more topics, higher = fewer).
|
| 521 |
+
- If too many topics (>200), suggest increasing threshold to 0.8.
|
| 522 |
+
- If too few topics (<20), suggest decreasing threshold to 0.6.
|
| 523 |
+
- NEVER skip Phase 4 saturation check or Phase 5.5 taxonomy comparison.
|
| 524 |
+
- NEVER proceed to Phase 6 without both runs completing Phase 5.5.
|
| 525 |
+
- NEVER invent topic labels β only present labels returned by Tool 3.
|
| 526 |
+
- NEVER cite paper IDs, titles, or sentences from memory β only from tool output.
|
| 527 |
+
- NEVER claim a theme is NOVEL or MAPPED without calling Tool 5 first.
|
| 528 |
+
- NEVER fabricate sentence counts or paper counts β only use tool-reported numbers.
|
| 529 |
+
- If a tool returns an error, explain clearly and continue.
|
| 530 |
+
- Keep responses concise. Tables + evidence, not paragraphs.
|
| 531 |
+
|
| 532 |
+
Current date: """ + datetime.now().strftime("%Y-%m-%d")
|
| 533 |
+
|
| 534 |
+
# Import-time breadcrumb: confirms the prompt constant was built and how big it is.
print(">>> agent.py: SYSTEM_PROMPT loaded ({} chars)".format(len(SYSTEM_PROMPT)))
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
def get_local_tools():
    """Return the 7 BERTopic tool callables provided by the ``tools`` module.

    The import is deliberately deferred to call time so that importing
    ``agent`` (e.g. just for SYSTEM_PROMPT) stays cheap.
    """
    print(">>> agent.py: loading tools...")
    from tools import get_all_tools as _load_tools
    return _load_tools()
|
app.py
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""app.py β Compact Gradio + Mistral + BERTopic. Version 2.1.0 | 4 April 2026. ZERO for/while/if.
|
| 2 |
+
|
| 3 |
+
FUNCTIONS IN THIS FILE:
|
| 4 |
+
_latest_output() β Scans /tmp for newest rq4_* file β feeds download button
|
| 5 |
+
respond() β Core chat handler: takes message + history + file β yields agent response
|
| 6 |
+
gr.Blocks() β One-page UI: header + upload + chatbot + input + download
|
| 7 |
+
"""
|
| 8 |
+
import os
|
| 9 |
+
import glob
|
| 10 |
+
import gradio as gr
|
| 11 |
+
from langchain_mistralai import ChatMistralAI
|
| 12 |
+
from langgraph.prebuilt import create_react_agent
|
| 13 |
+
from langgraph.checkpoint.memory import MemorySaver
|
| 14 |
+
from agent import SYSTEM_PROMPT, get_local_tools
|
| 15 |
+
|
| 16 |
+
print(">>> app.py: imports complete")
|
| 17 |
+
|
| 18 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 19 |
+
# AGENT SETUP β Mistral brain + 5 BERTopic tools
|
| 20 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
+
llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=300)
|
| 22 |
+
tools = get_local_tools()
|
| 23 |
+
agent = create_react_agent(model=llm, tools=tools, prompt=SYSTEM_PROMPT, checkpointer=MemorySaver())
|
| 24 |
+
print(f">>> app.py: agent ready ({len(tools)} tools, Mistral Large)")
|
| 25 |
+
|
| 26 |
+
_msg_count = 0
|
| 27 |
+
_uploaded = {"path": ""}
|
| 28 |
+
|
| 29 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
# COMPACT HEADER β fits in ~60px
|
| 31 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 32 |
+
HEADER_HTML = """
|
| 33 |
+
<style>
|
| 34 |
+
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:wght@400;500;600;700&display=swap');
|
| 35 |
+
.gradio-container {font-family: 'DM Sans', sans-serif !important; padding: 4px 8px !important; max-width: 100% !important;}
|
| 36 |
+
footer {display: none !important;}
|
| 37 |
+
.section-box {border: 1px solid #e2e8f0; border-radius: 8px; padding: 8px 12px; margin-bottom: 6px; background: #fafbfc;}
|
| 38 |
+
.section-label {font-size: 0.75em; font-weight: 600; color: #64748b; text-transform: uppercase; letter-spacing: 0.5px; margin-bottom: 4px;}
|
| 39 |
+
</style>
|
| 40 |
+
<div style="background: linear-gradient(135deg, #0f172a 0%, #1e293b 50%, #334155 100%); border-radius: 8px; padding: 8px 16px; margin-bottom: 4px; color: white; display: flex; align-items: center; gap: 10px;">
|
| 41 |
+
<span style="font-size: 1.3em;">π¬</span>
|
| 42 |
+
<div>
|
| 43 |
+
<span style="font-size: 1em; font-weight: 700; color: #e0e0ff;">Topic Modelling β Agentic AI</span>
|
| 44 |
+
<span style="font-size: 0.65em; color: #94a3b8; margin-left: 8px;">Mistral π«π· Β· Cosine Clustering Β· 384d Β· B&C Thematic Analysis</span>
|
| 45 |
+
</div>
|
| 46 |
+
</div>
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
# FUNCTION 1: Find latest output for download
|
| 51 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 52 |
+
def _latest_output():
    """Gather every rq4_* artifact under /tmp, ordered by pipeline phase.

    The returned list feeds the gr.File download component; returns None
    when no output has been produced yet.
    """
    # Phase order: summaries -> labels -> themes -> taxonomy -> charts -> comparison -> narrative
    phase_order = {"summaries": 1, "labels": 2, "themes": 3, "taxonomy": 4,
                   "emb": 0, "intertopic": 5, "bars": 6, "hierarchy": 7,
                   "heatmap": 8, "comparison": 9, "narrative": 10}
    patterns = ("/tmp/rq4_*.csv", "/tmp/rq4_*.html",
                "/tmp/rq4_*.txt", "/tmp/checkpoints/rq4_*.json")
    found = [path for pattern in patterns for path in glob.glob(pattern)]
    # A file's rank is the summed weight of every phase keyword its name contains;
    # sorted() is stable, so ties keep glob/pattern order exactly like the original.
    rank = lambda path: sum(v * (k in path) for k, v in phase_order.items())
    return sorted(found, key=rank) or None
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _build_progress():
    """Render the B&C phase pipeline as a one-line HTML strip.

    A phase pill turns green once its checkpoint/output file exists on disk.
    NOTE(review): the circled-digit labels, arrow separator and check marks
    are decoded from mojibake in the original source — confirm against the
    rendered UI.
    """
    checks = [
        ("① Load", bool(glob.glob("/tmp/checkpoints/rq4_*_summaries.json") or glob.glob("/tmp/checkpoints/rq4_*_emb.npy"))),
        ("② Codes", bool(glob.glob("/tmp/checkpoints/rq4_*_labels.json"))),
        ("③ Themes", bool(glob.glob("/tmp/checkpoints/rq4_*_themes.json"))),
        ("④ Review", bool(glob.glob("/tmp/checkpoints/rq4_*_themes.json"))),
        ("⑤ Names", bool(glob.glob("/tmp/checkpoints/rq4_*_themes.json"))),
        ("⑤½ PAJAIS", bool(glob.glob("/tmp/checkpoints/rq4_*_taxonomy_map.json"))),
        ("⑥ Report", bool(glob.glob("/tmp/rq4_comparison.csv") or glob.glob("/tmp/rq4_narrative.txt"))),
    ]
    # bool-keyed palette keeps the file's no-if style: (background, foreground, mark)
    palette = {True: ("#22c55e", "white", "✅"), False: ("#e2e8f0", "#64748b", "⬜")}

    def _pill(label, done):
        bg, fg, mark = palette[done]
        return (f"<span style='padding:3px 8px;border-radius:4px;"
                f"background:{bg};color:{fg};"
                f"font-weight:600;font-size:0.8em;'>"
                f"{label} {mark}</span>")

    items = " → ".join(_pill(label, done) for label, done in checks)
    return "<div style='padding:6px 0;text-align:center;'>" + items + "</div>"
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 97 |
+
# FUNCTION 2: Chat handler β core of the app
|
| 98 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 99 |
+
def respond(message, chat_history, uploaded_file):
    """Handle one chat turn (generator — yields twice per call):
    1. Store uploaded file path (if new upload)
    2. Append file + phase context to the message so the agent knows where the CSV is
    3. Yield a progress bubble immediately (user sees instant feedback)
    4. Invoke agent (Mistral brain decides which tools to call)
    5. Replace progress bubble with agent's actual response
    6. Update download link to latest output file

    Uses single thread_id="session" so agent remembers across turns.
    Agent asks clarification FIRST (via SYSTEM_PROMPT) before running heavy tools.

    Yields:
        (chat_history, textbox_value, download_files) tuples for Gradio.
    """
    global _msg_count
    _msg_count += 1

    # Store file path — no if/else, `or` short-circuit keeps the previous upload
    _uploaded["path"] = uploaded_file or _uploaded.get("path", "")

    # Guard: tell agent when no file uploaded (prevents hallucinated filepath).
    # `str * bool` evaluates to "" when False, so `or` falls through to the warning.
    file_note = (f"\n[CSV file at: {_uploaded['path']}]" * bool(_uploaded["path"])
                 ) or "\n[No CSV uploaded yet β ask user to upload a file first]"

    # Context: tell agent what phase we're in based on checkpoint files on disk
    phase_context = (
        "\n[Phase context: labels exist]" * bool(glob.glob("/tmp/checkpoints/rq4_*_labels.json"))
        or "\n[Phase context: embeddings exist]" * bool(glob.glob("/tmp/checkpoints/rq4_*_emb.npy"))
        or "\n[Phase context: fresh start]"
    )

    # Empty message falls back to a default analysis request
    text = ((message or "").strip() or "Analyze my Scopus CSV") + file_note + phase_context
    print(f"\n{'='*60}\n>>> MSG #{_msg_count}: '{text[:120]}'\n{'='*60}")

    # Yield progress bubble immediately — user sees an instant response
    chat_history = chat_history + [
        {"role": "user", "content": (message or "").strip()},
        {"role": "assistant", "content": "π¬ **Working...** _Agent is thinking..._"},
    ]
    yield chat_history, "", _latest_output()

    # Shared session — agent remembers across messages (loaded CSV, phase, etc.)
    # handle_tool_error=True on all tools prevents session poisoning from failures
    result = agent.invoke(
        {"messages": [("human", text)]},
        config={"configurable": {"thread_id": "session"}},
    )
    response = result["messages"][-1].content
    print(f">>> Response ({len(response)} chars)")

    # Replace progress bubble with actual response
    chat_history[-1] = {"role": "assistant", "content": response}
    yield chat_history, "", _latest_output()
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 152 |
+
# UI LAYOUT β Everything fits ONE screen (~500px)
|
| 153 |
+
#
|
| 154 |
+
# ββββββββββββββββββββββββββββββββββββββ
|
| 155 |
+
# β π¬ BERTopic Agent (compact) β ~55px
|
| 156 |
+
# ββββββββββββββββββββββββββββββββββββββ€
|
| 157 |
+
# β [π Upload CSV] β ~35px
|
| 158 |
+
# ββββββββββββββββββββββββββββββββββββββ€
|
| 159 |
+
# β π¬ Chat bubbles (cloud style) β
|
| 160 |
+
# β User: "Analyze my data" β ~320px
|
| 161 |
+
# β Agent: "I found 1390 papers. β
|
| 162 |
+
# β Which config? All 3?" β
|
| 163 |
+
# ββββββββββββββββββββββββββββββββββββββ€
|
| 164 |
+
# β [Type message... ] [β] β ~35px
|
| 165 |
+
# ββββββββββββββββββββββββββββββββββββββ€
|
| 166 |
+
# β π₯ Download β ~35px
|
| 167 |
+
# ββββββββββββββββββββββββββββββββββββββ
|
| 168 |
+
# TOTAL: ~480px β fits any screen
|
| 169 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 170 |
+
print(">>> Building UI...")
|
| 171 |
+
with gr.Blocks(title="Topic Modelling β Agentic AI") as demo:
|
| 172 |
+
gr.HTML(HEADER_HTML)
|
| 173 |
+
|
| 174 |
+
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 175 |
+
# SECTION 1: DATA INPUT
|
| 176 |
+
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 177 |
+
gr.HTML('<div class="section-label">β DATA INPUT</div>')
|
| 178 |
+
with gr.Group():
|
| 179 |
+
with gr.Row():
|
| 180 |
+
upload = gr.File(label="οΏ½οΏ½οΏ½οΏ½ Upload Scopus CSV", file_types=[".csv"])
|
| 181 |
+
gr.Markdown("**Upload your CSV** then type `run abstract only` in the chat below")
|
| 182 |
+
|
| 183 |
+
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 184 |
+
# PHASE PROGRESS PIPELINE
|
| 185 |
+
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 186 |
+
phase_progress = gr.HTML(value=_build_progress())
|
| 187 |
+
|
| 188 |
+
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 189 |
+
# SECTION 2: AGENT CONVERSATION
|
| 190 |
+
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 191 |
+
gr.HTML('<div class="section-label">β‘ AGENT CONVERSATION β follow the prompts below</div>')
|
| 192 |
+
with gr.Group():
|
| 193 |
+
chatbot = gr.Chatbot(height=250, show_label=False,
|
| 194 |
+
placeholder="Upload your Scopus CSV above, then type: run abstract only")
|
| 195 |
+
with gr.Row():
|
| 196 |
+
msg = gr.Textbox(
|
| 197 |
+
placeholder="run Β· approve Β· show topic 4 papers Β· group 0 1 5 Β· done",
|
| 198 |
+
show_label=False, scale=9, lines=1, max_lines=1, container=False)
|
| 199 |
+
send = gr.Button("Send", variant="primary", scale=1, min_width=70)
|
| 200 |
+
|
| 201 |
+
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 202 |
+
# SECTION 3: RESULTS
|
| 203 |
+
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 204 |
+
gr.HTML('<div class="section-label">β’ RESULTS β review table, charts, downloads</div>')
|
| 205 |
+
with gr.Group():
|
| 206 |
+
with gr.Tabs():
|
| 207 |
+
with gr.Tab("π Review Table"):
|
| 208 |
+
gr.Markdown("*Edit Approve / Rename To / Your Reasoning β click Submit. "
|
| 209 |
+
"Type `show topic 4 papers` in chat to see paper list.*")
|
| 210 |
+
review_table = gr.Dataframe(
|
| 211 |
+
headers=["#", "Topic Label", "Top Evidence Sentence",
|
| 212 |
+
"Sentences", "Papers", "Approve", "Rename To", "Your Reasoning"],
|
| 213 |
+
datatype=["number", "str", "str", "number", "number", "str", "str", "str"],
|
| 214 |
+
column_count=(8, "fixed"),
|
| 215 |
+
interactive=True,
|
| 216 |
+
row_count=(1, "dynamic"),
|
| 217 |
+
)
|
| 218 |
+
submit_review = gr.Button("β
Submit Review to Agent", variant="primary")
|
| 219 |
+
|
| 220 |
+
# Paper viewer β select topic to see papers
|
| 221 |
+
gr.Markdown("---")
|
| 222 |
+
gr.Markdown("**π View papers in a topic:**")
|
| 223 |
+
with gr.Row():
|
| 224 |
+
topic_num = gr.Number(label="Topic #", value=0, precision=0, minimum=0, scale=1)
|
| 225 |
+
view_papers_btn = gr.Button("Show Papers", scale=1)
|
| 226 |
+
paper_list = gr.Textbox(label="Papers in selected topic", lines=8,
|
| 227 |
+
interactive=False)
|
| 228 |
+
|
| 229 |
+
with gr.Tab("π Charts"):
|
| 230 |
+
chart_selector = gr.Dropdown(choices=[], label="Select Chart", interactive=True)
|
| 231 |
+
chart_display = gr.HTML(
|
| 232 |
+
value="<div style='height:350px;display:flex;align-items:center;justify-content:center;"
|
| 233 |
+
"color:#94a3b8;border:1px dashed #cbd5e1;border-radius:8px;'>"
|
| 234 |
+
"Charts appear after BERTopic runs</div>")
|
| 235 |
+
|
| 236 |
+
with gr.Tab("π₯ Download"):
|
| 237 |
+
gr.Markdown(
|
| 238 |
+
"**Files by Phase (per run: abstract / title):**\n\n"
|
| 239 |
+
"**Phase 2 β Discovery:** `summaries.json` (raw topics) Β· `emb.npy` (embeddings)\n\n"
|
| 240 |
+
"**Phase 2 β Labeling:** `labels.json` (Mistral-labeled topics)\n\n"
|
| 241 |
+
"**Phase 2 β Charts:** `intertopic.html` Β· `bars.html` Β· `hierarchy.html` Β· `heatmap.html`\n\n"
|
| 242 |
+
"**Phase 3 β Themes:** `themes.json` (consolidated themes)\n\n"
|
| 243 |
+
"**Phase 5.5 β Taxonomy:** `taxonomy_map.json` (PAJAIS mapped vs NOVEL)\n\n"
|
| 244 |
+
"**Phase 6 β Report:** `comparison.csv` (abstract vs title) Β· `narrative.txt` (500-word draft)"
|
| 245 |
+
)
|
| 246 |
+
download = gr.File(label="All output files", file_count="multiple")
|
| 247 |
+
|
| 248 |
+
def _load_chart(chart_name):
    """Load a generated Plotly chart into an iframe.

    gr.HTML strips <script> tags, so the chart HTML is escaped and embedded
    via iframe `srcdoc`, where its scripts are allowed to execute.

    Args:
        chart_name: basename of an rq4_*.html file under /tmp (from the dropdown).

    Returns:
        An <iframe> snippet, or a placeholder <div> when the file is missing/empty.
    """
    import html as html_mod
    import os
    from pathlib import Path
    # basename() guards against path segments sneaking into the dropdown value.
    chart_path = Path("/tmp") / os.path.basename(str(chart_name))
    # read_text() closes the handle (the previous open().read() leaked it) and
    # pins UTF-8, matching how the Plotly charts are written.
    content = chart_path.is_file() and chart_path.read_text(encoding="utf-8") or ""
    escaped = html_mod.escape(content) * bool(content)
    return (f'<iframe srcdoc="{escaped}" width="100%" height="450" '
            f'frameborder="0" sandbox="allow-scripts allow-same-origin"></iframe>'
            ) * bool(escaped) or "<div style='padding:20px;color:#94a3b8;'>Select a chart above</div>"
|
| 257 |
+
|
| 258 |
+
def _get_chart_choices():
    """Return the basenames of every rq4_*.html chart currently in /tmp."""
    return [os.path.basename(p) for p in sorted(glob.glob("/tmp/rq4_*.html"))]
|
| 262 |
+
|
| 263 |
+
def _load_review_table():
    """Build the 8-column review table from the most advanced checkpoint JSON.

    Priority: taxonomy_map > themes > labels > summaries.
    Columns: #, Label, Evidence/Mapping, Sentences, Papers, Approve, Rename, Reasoning.
    NOTE(review): the arrow in the taxonomy evidence string is decoded from
    mojibake in the original source — confirm against the rendered table.
    """
    import json
    taxonomy_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_taxonomy_map.json"))
    theme_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_themes.json"))
    label_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_labels.json"))
    summary_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_summaries.json"))

    # Most advanced phase wins; empty string when nothing exists yet.
    path = ((taxonomy_files and taxonomy_files[-1])
            or (theme_files and theme_files[-1])
            or (label_files and label_files[-1])
            or (summary_files and summary_files[-1]) or "")
    is_taxonomy = bool(taxonomy_files and taxonomy_files[-1] == path)
    data = (os.path.exists(path) and json.load(open(path))) or []

    # Taxonomy entries lack counts — borrow them from the themes checkpoint.
    theme_lookup = {}
    (is_taxonomy and theme_files) and theme_lookup.update({
        t.get("label", ""): t for t in json.load(open(theme_files[-1]))})

    def _evidence(item):
        # PAJAIS mapping for taxonomy rows; nearest-centroid sentence otherwise.
        return ((is_taxonomy and f"→ {item.get('pajais_match', '?')} | {item.get('reasoning', '')}"[:120])
                or (item.get("nearest", [{}])[0].get("sentence", "")[:120] + "...") * bool(item.get("nearest")))

    def _count(item, key):
        # Prefer the themes checkpoint's counts when merging taxonomy rows.
        return theme_lookup.get(item.get("label", ""), item).get(key, item.get(key, 0))

    rows = [[idx,
             item.get("label", item.get("top_words", "")[:60]),
             _evidence(item),
             _count(item, "sentence_count"),
             _count(item, "paper_count"),
             "yes",
             "",
             ""]
            for idx, item in enumerate(data)]
    return rows or [[0, "No data yet", "", 0, 0, "", "", ""]]
|
| 301 |
+
|
| 302 |
+
def _show_papers(topic_id):
    """Show 5 nearest centroid sentences (evidence) + all paper titles for selected topic.

    Scans both runs' checkpoint files (labels preferred over raw summaries),
    finds every topic entry matching `topic_id`, and formats one text section
    per run. Returns a not-found message when no run contains the topic.
    """
    import json
    topic_id = int(topic_id)
    summary_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_summaries.json"))
    label_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_labels.json"))
    # Labeled checkpoints supersede raw summaries once Tool 3 has run.
    all_files = label_files or summary_files
    # Run name ("abstract"/"title") is the 2nd underscore-separated filename token.
    source_labels = list(map(
        lambda f: os.path.basename(f).split("_")[1], all_files))
    all_data = list(map(lambda f: json.load(open(f)), all_files))

    lines = []
    # Side-effecting map: for each (run_label, topics) pair, append a formatted
    # section for every topic whose id matches. `and` short-circuits the append
    # for non-matching topics (file-wide no-if style).
    list(map(lambda pair: list(map(
        lambda t: (t.get("topic_id") == topic_id) and lines.append(
            f"βββ {pair[0].upper()} β Topic {topic_id}: "
            f"{t.get('label', t.get('top_words','')[:50])} βββ\n"
            f"{t.get('sentence_count', 0)} sentences from {t.get('paper_count', 0)} papers\n"
            f"AI Reasoning: {t.get('reasoning', 'not yet labeled')}\n\n"
            f"ββ 5 NEAREST CENTROID SENTENCES (evidence) ββ\n"
            + "\n".join(list(map(
                lambda i: f" {i+1}. \"{t['nearest'][i]['sentence'][:200]}\"\n"
                f" Paper: {t['nearest'][i].get('title', '')[:100]}",
                range(min(5, len(t.get('nearest', [])))))))
            + "\n\nββ ALL PAPER TITLES ββ\n"
            + "\n".join(list(map(
                lambda i: f" {i+1}. {t['paper_titles'][i]}",
                range(len(t.get('paper_titles', []))))))
        ),
        pair[1])),
        zip(source_labels, all_data)))

    return "\n\n".join(lines) or f"Topic {topic_id} not found."
|
| 334 |
+
|
| 335 |
+
view_papers_btn.click(_show_papers, [topic_num], [paper_list])
|
| 336 |
+
|
| 337 |
+
def _submit_review(table_data, chat_history):
    """Convert edited review table into a message for the agent (generator).

    Researcher's edits (approve/rename/reasoning) become natural language:
    a non-empty "Rename To" wins, otherwise Approve "y…" / Reject "n…".
    Yields a progress bubble first, then the agent's response plus refreshed
    download / chart / table / progress widgets.
    """
    rows = table_data.values.tolist()
    # `str * bool` keeps only the clauses that apply to each row (no-if style):
    # r[0]=#, r[1]=label, r[5]=approve, r[6]=rename-to, r[7]=reasoning.
    lines = list(map(
        lambda r: (
            f"Topic {int(r[0])}: "
            + (f"RENAME to '{r[6]}'" * bool(str(r[6]).strip()))
            + (f"APPROVE '{r[1]}'" * (not bool(str(r[6]).strip())) * (str(r[5]).lower().startswith("y")))
            + (f"REJECT" * (str(r[5]).lower().startswith("n")))
            + (f" β reason: {r[7]}" * bool(str(r[7]).strip()))
        ), rows))
    review_msg = "Review decisions:\n" + "\n".join(lines)
    print(f">>> Review submitted: {review_msg[:200]}")

    # Instant feedback bubble before the (slow) agent call.
    chat_history = chat_history + [
        {"role": "user", "content": review_msg},
        {"role": "assistant", "content": "π¬ **Processing review decisions...**"},
    ]
    yield chat_history, _latest_output(), gr.update(), gr.update(), gr.update(), _build_progress()

    # Same shared thread_id as respond(): the agent keeps session memory.
    result = agent.invoke(
        {"messages": [("human", review_msg)]},
        config={"configurable": {"thread_id": "session"}},
    )
    response = result["messages"][-1].content
    chat_history[-1] = {"role": "assistant", "content": response}
    # Reload table with updated themes/labels
    table_data = _load_review_table()
    yield (chat_history, _latest_output(),
           gr.update(choices=_get_chart_choices()), gr.update(),
           gr.update(value=table_data), _build_progress())
|
| 369 |
+
|
| 370 |
+
chart_selector.change(_load_chart, [chart_selector], [chart_display])
|
| 371 |
+
submit_review.click(_submit_review, [review_table, chatbot],
|
| 372 |
+
[chatbot, download, chart_selector, chart_display, review_table, phase_progress])
|
| 373 |
+
|
| 374 |
+
def respond_with_viz(message, chat_history, uploaded_file):
    """Wrap respond() and update chart dropdown + review table after each message.

    Generator with exactly two yields, mirroring respond()'s two yields but
    extended with chart-selector, chart-display, review-table and progress
    widget updates.
    """
    gen = respond(message, chat_history, uploaded_file)
    # First yield (progress bubble) — pass through; only refresh chart choices.
    hist, txt, dl = next(gen)
    yield hist, txt, dl, gr.update(choices=_get_chart_choices()), gr.update(), gr.update(), _build_progress()
    # Second yield (final response) — populate table, charts, and progress strip.
    hist, txt, dl = next(gen)
    choices = _get_chart_choices()
    # Newest chart auto-loads; gr.update() (no-op) when no charts exist yet.
    first_chart = (choices and _load_chart(choices[-1])) or gr.update()
    table_data = _load_review_table()
    yield (hist, txt, dl,
           gr.update(choices=choices, value=(choices and choices[-1]) or None),
           first_chart,
           gr.update(value=table_data), _build_progress())
|
| 389 |
+
|
| 390 |
+
msg.submit(respond_with_viz, [msg, chatbot, upload],
|
| 391 |
+
[chatbot, msg, download, chart_selector, chart_display, review_table, phase_progress])
|
| 392 |
+
send.click(respond_with_viz, [msg, chatbot, upload],
|
| 393 |
+
[chatbot, msg, download, chart_selector, chart_display, review_table, phase_progress])
|
| 394 |
+
|
| 395 |
+
def _auto_load_csv(uploaded_file, chat_history):
    """Auto-trigger analysis when CSV is uploaded — stats appear without typing.

    Same two-yield shape as respond_with_viz(), but driven by the upload
    event, so there is no textbox output to clear (txt is discarded).
    """
    gen = respond("Analyze my Scopus CSV", chat_history, uploaded_file)
    # Progress bubble first.
    hist, txt, dl = next(gen)
    yield hist, dl, gr.update(), gr.update(), gr.update(), _build_progress()
    # Final agent response + refreshed chart/table/progress widgets.
    hist, txt, dl = next(gen)
    choices = _get_chart_choices()
    first_chart = (choices and _load_chart(choices[-1])) or gr.update()
    table_data = _load_review_table()
    yield (hist, dl,
           gr.update(choices=choices, value=(choices and choices[-1]) or None),
           first_chart,
           gr.update(value=table_data), _build_progress())
|
| 408 |
+
|
| 409 |
+
upload.change(_auto_load_csv, [upload, chatbot],
|
| 410 |
+
[chatbot, download, chart_selector, chart_display, review_table, phase_progress])
|
| 411 |
+
|
| 412 |
+
print(">>> Launching...")
|
| 413 |
+
demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# requirements.txt v2.0 | 4 April 2026
|
| 2 |
+
# BERTopic + Mistral LLM (French, Apache 2.0, GDPR-safe)
|
| 3 |
+
langchain
|
| 4 |
+
langchain-mistralai
|
| 5 |
+
langgraph
|
| 6 |
+
langchain-core
|
| 7 |
+
bertopic
|
| 8 |
+
sentence-transformers
|
| 9 |
+
numpy
|
| 10 |
+
pandas
|
| 11 |
+
plotly
|
| 12 |
+
kaleido
|
| 13 |
+
gradio
|
tools.py
ADDED
|
@@ -0,0 +1,623 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tools.py β Sentence-level BERTopic pipeline + Mistral LLM. Version 3.0.0 | 4 April 2026. ZERO for/while/if.
|
| 2 |
+
|
| 3 |
+
PIPELINE:
|
| 4 |
+
Paper β split into sentences β each sentence gets paper_id + sent_id + metadata
|
| 5 |
+
β embed sentences (384d) β AgglomerativeClustering cosine β centroid nearest 5 sentences
|
| 6 |
+
β Mistral labels topics from sentence evidence + paper metadata
|
| 7 |
+
β one paper can span MULTIPLE topics
|
| 8 |
+
"""
|
| 9 |
+
from langchain_core.tools import tool
|
| 10 |
+
import os
|
| 11 |
+
import json
|
| 12 |
+
import re
|
| 13 |
+
import numpy as np
|
| 14 |
+
import pandas as pd
|
| 15 |
+
|
| 16 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 17 |
+
# DEBUG + STATE + CONSTANTS
|
| 18 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 19 |
+
DEBUG = True
|
| 20 |
+
debug = {True: print, False: lambda *a, **k: None}[DEBUG]
|
| 21 |
+
|
| 22 |
+
CHECKPOINT_DIR = "/tmp/checkpoints"
|
| 23 |
+
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
|
| 24 |
+
|
| 25 |
+
NEAREST_K = 5
|
| 26 |
+
SENT_SPLIT_RE = r'(?<=[.!?])\s+(?=[A-Z])'
|
| 27 |
+
MIN_SENT_LEN = 30
|
| 28 |
+
|
| 29 |
+
RUN_CONFIGS = {
|
| 30 |
+
"abstract": ["Abstract"],
|
| 31 |
+
"title": ["Title"],
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
_data = {}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 38 |
+
# HELPER: Split text into sentences (regex, no nltk)
|
| 39 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
def _split_sentences(text):
    """Break *text* into sentences with a plain regex (no nltk dependency).

    Splits after `.`, `!` or `?` followed by whitespace and an uppercase
    letter (SENT_SPLIT_RE); fragments whose stripped length is below
    MIN_SENT_LEN are dropped."""
    long_enough = lambda fragment: len(fragment.strip()) >= MIN_SENT_LEN
    pieces = re.split(SENT_SPLIT_RE, str(text))
    return list(filter(long_enough, pieces))
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 48 |
+
# TOOL 1: Load Scopus CSV
|
| 49 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
@tool
def load_scopus_csv(filepath: str) -> str:
    """Load a Scopus CSV export and show preview. Call this first.

    Args:
        filepath: Path to the uploaded .csv file.

    Returns:
        Row count, column names, and sample data."""
    debug(f"\n>>> TOOL: load_scopus_csv(filepath='{filepath}')")
    # utf-8-sig strips the BOM that Scopus exports usually carry.
    df = pd.read_csv(filepath, encoding="utf-8-sig")
    # Cache the dataframe in module state for the later pipeline tools.
    _data["df"] = df
    debug(f">>> Loaded {len(df)} rows, {len(df.columns)} columns")
    # Only preview the text columns actually present in this export.
    target_cols = list(filter(lambda c: c in df.columns, ["Title", "Abstract", "Author Keywords"]))
    sample = df[target_cols].head(3).to_string(max_colwidth=80)
    # Non-null counts per target column (name kept from original; it reports
    # populated cells, not nulls).
    null_counts = ", ".join(list(map(
        lambda c: f"{c}: {df[c].notna().sum()}/{len(df)}", target_cols)))

    # Estimate sentence counts from the first 5 abstracts.
    # NOTE(review): assumes "Abstract" and "Title" columns exist — a CSV
    # missing either raises KeyError here; confirm exports always include both.
    sample_sents = df["Abstract"].head(5).apply(_split_sentences).apply(len)
    avg_abstract_sents = sample_sents.mean()
    est_abstract = int(avg_abstract_sents * len(df))
    title_count = int(df["Title"].notna().sum())

    return (f"π **Dataset Statistics:**\n"
            f"- **Papers:** {len(df)}\n"
            f"- **Abstract sentences:** ~{est_abstract} (~{avg_abstract_sents:.0f} per paper)\n"
            f"- **Title sentences:** {title_count} (1 per paper)\n"
            f"- **Non-null:** {null_counts}\n\n"
            f"Columns: {', '.join(list(df.columns)[:15])}\n\n"
            f"Sample:\n{sample}")
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ───────────────────────────────────────────────
# TOOL 2: Sentence-Level BERTopic Pipeline
# ───────────────────────────────────────────────
@tool
def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """Sentence-level BERTopic: split papers → embed sentences → cosine similarity clustering → centroid nearest 5 → Plotly charts.
    Each sentence keeps paper_id, sent_id, and metadata. One paper can span multiple topics.
    Uses AgglomerativeClustering with cosine distance — groups sentences by similarity threshold.

    Args:
        run_key: One of 'abstract' or 'title' — selects which columns to split into sentences.
        threshold: Cosine distance threshold (0.0-1.0). Lower = stricter = more topics.
            0.5 = very strict (~2000 topics), 0.7 = recommended (~100 topics, default), 0.8 = loose (~30 topics), 0.9 = very loose (~10 topics).

    Returns:
        Topic summary with sentence counts, paper counts, and 5 nearest centroid sentences."""
    debug(f"\n>>> TOOL: run_bertopic_discovery(run_key='{run_key}', threshold={threshold})")
    # Heavy imports are function-local so merely importing tools.py stays cheap.
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer

    # Requires load_scopus_csv to have populated _data["df"] first (KeyError otherwise).
    df = _data["df"].copy()
    cols = RUN_CONFIGS[run_key]
    available = list(filter(lambda c: c in df.columns, cols))
    debug(f">>> Columns: {available}")

    # ── Step 1: Assemble text per paper ──
    df["_text"] = df[available].fillna("").agg(" ".join, axis=1)
    df["_paper_id"] = df.index
    debug(f">>> {len(df)} papers assembled")

    # ── Step 2: Split into sentences — regex, no nltk ──
    debug(">>> Splitting into sentences...")
    df["_sentences"] = df["_text"].apply(_split_sentences)
    debug(f">>> Sentence counts: min={df['_sentences'].apply(len).min()}, "
          f"max={df['_sentences'].apply(len).max()}, "
          f"mean={df['_sentences'].apply(len).mean():.1f}")

    # ── Step 3: Explode to sentence-level DataFrame ──
    # One row per sentence; each keeps its paper's metadata columns.
    meta_cols = ["_paper_id", "Title", "Author Keywords", "_sentences"]
    available_meta = list(filter(lambda c: c in df.columns, meta_cols))
    sent_df = df[available_meta].explode("_sentences").rename(
        columns={"_sentences": "text"}).reset_index(drop=True)
    sent_df = sent_df.dropna(subset=["text"]).reset_index(drop=True)
    sent_df["sent_id"] = sent_df.groupby("_paper_id").cumcount()

    # ── Step 3b: Filter out publisher boilerplate sentences ──
    # Scopus abstracts contain copyright/license noise that clustering picks up as topics.
    # These are NOT research content — remove before embedding.
    debug(">>> Filtering publisher boilerplate...")
    _n_before = len(sent_df)
    boilerplate_patterns = "|".join([
        r"Licensee MDPI",
        r"Published by Informa",
        r"Published by Elsevier",
        r"Taylor & Francis",
        r"Copyright ©",
        r"Creative Commons",
        r"open access article",
        r"Inderscience Enterprises",
        r"All rights reserved",
        r"This is an open access",
        r"distributed under the terms",
        r"The Author\(s\)",
        r"Springer Nature",
        r"Emerald Publishing",
        r"limitations and future",
        r"limitations and implications",
        r"limitations are discussed",
        r"limitations have been discussed",
        r"implications are discussed",
        r"implications were discussed",
        r"implications are presented",
        r"concludes with .* implications",
    ])
    clean_mask = ~sent_df["text"].str.contains(boilerplate_patterns, case=False, regex=True, na=False)
    sent_df = sent_df[clean_mask].reset_index(drop=True)
    # Re-number sent_id so it stays contiguous per paper after filtering.
    sent_df["sent_id"] = sent_df.groupby("_paper_id").cumcount()
    debug(f">>> Filtered: {_n_before} → {len(sent_df)} sentences ({_n_before - len(sent_df)} boilerplate removed)")
    n_sentences = len(sent_df)
    n_papers = len(df)
    debug(f">>> {n_sentences} sentences from {n_papers} papers")

    # ── Step 4: Embed sentences (384d, L2-normalized) ──
    # BERTopic FAQ: "normalize them first to force a cosine-related distance metric"
    # Math: for L2-normalized vectors, euclidean²(a,b) = 2(1 - cos(a,b)) — same clusters as cosine
    debug(">>> Embedding sentences with all-MiniLM-L6-v2 (L2-normalized)...")
    docs = sent_df["text"].tolist()
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedder.encode(docs, show_progress_bar=False, normalize_embeddings=True)
    debug(f">>> Embeddings: {embeddings.shape}, normalized: True")

    # Save checkpoint (reloaded nowhere in this file; kept for external inspection).
    np.save(f"{CHECKPOINT_DIR}/rq4_{run_key}_emb.npy", embeddings)

    # ── Step 5: Agglomerative Clustering with COSINE similarity threshold ──
    # Groups sentences where cosine_distance < threshold → same cluster
    # No dimension reduction. No density estimation. Pure similarity grouping.
    debug(f">>> AgglomerativeClustering cosine threshold={threshold} on 384d embeddings...")
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.cluster import AgglomerativeClustering
    # Identity FunctionTransformer replaces UMAP — embeddings pass through unchanged.
    no_umap = FunctionTransformer()
    cluster_model = AgglomerativeClustering(
        n_clusters=None,
        metric="cosine",
        linkage="average",
        distance_threshold=threshold,
    )
    topic_model = BERTopic(
        hdbscan_model=cluster_model,  # BERTopic accepts any clusterer via this slot
        umap_model=no_umap,
    )
    # NOTE(review): `probs` is never used below — AgglomerativeClustering yields no probabilities.
    topics, probs = topic_model.fit_transform(docs, embeddings)
    # -1 is BERTopic's outlier label; exclude it from the topic count.
    n_topics = len(set(topics)) - int(-1 in topics)
    n_outliers = int(np.sum(np.array(topics) == -1))
    debug(f">>> {n_topics} topics, {n_outliers} outlier sentences")

    # Store for later tools (consolidate_into_themes reads these three arrays + sent_df).
    _data[f"{run_key}_model"] = topic_model
    _data[f"{run_key}_topics"] = np.array(topics)
    _data[f"{run_key}_embeddings"] = embeddings
    _data[f"{run_key}_sent_df"] = sent_df

    # ── Step 6: BERTopic Plotly visualizations (skip charts that need 3+ topics) ──
    # `(cond) and expr` is the module's no-if idiom: expr only runs when cond is truthy.
    debug(f">>> Generating visualizations ({n_topics} topics)...")
    # visualize_topics() uses UMAP internally — crashes with < 3 topics
    (n_topics >= 3) and topic_model.visualize_topics().write_html(
        f"/tmp/rq4_{run_key}_intertopic.html", include_plotlyjs="cdn")
    # barchart works with 1+ topics
    (n_topics >= 1) and topic_model.visualize_barchart(
        top_n_topics=min(10, max(1, n_topics))).write_html(
        f"/tmp/rq4_{run_key}_bars.html", include_plotlyjs="cdn")
    # hierarchy needs 2+ topics
    (n_topics >= 2) and topic_model.visualize_hierarchy().write_html(
        f"/tmp/rq4_{run_key}_hierarchy.html", include_plotlyjs="cdn")
    # heatmap needs 2+ topics
    (n_topics >= 2) and topic_model.visualize_heatmap().write_html(
        f"/tmp/rq4_{run_key}_heatmap.html", include_plotlyjs="cdn")
    debug(f">>> Visualizations saved (skipped charts needing more topics)")

    # ── Step 7: Centroid nearest 5 SENTENCES — COSINE similarity ──
    topics_arr = np.array(topics)
    topic_info = topic_model.get_topic_info()
    valid_rows = list(filter(lambda r: r["Topic"] != -1, topic_info.to_dict("records")))

    def _centroid_nearest(row):
        """Find 5 sentences nearest to topic centroid via cosine similarity."""
        mask = topics_arr == row["Topic"]
        member_idx = np.where(mask)[0]
        member_embs = embeddings[mask]
        centroid = member_embs.mean(axis=0)
        # Cosine distance: 1 - cos_sim. For normalized vectors: cos_sim = dot product
        # (norms recomputed anyway; +1e-10 guards division by zero).
        norms = np.linalg.norm(member_embs, axis=1) * np.linalg.norm(centroid)
        cosine_sim = (member_embs @ centroid) / (norms + 1e-10)
        dists = 1 - cosine_sim
        nearest = np.argsort(dists)[:NEAREST_K]

        # 5 nearest sentences with paper metadata (values truncated for the LLM prompt)
        nearest_evidence = list(map(lambda i: {
            "sentence": str(sent_df.iloc[member_idx[i]]["text"])[:250],
            "paper_id": int(sent_df.iloc[member_idx[i]]["_paper_id"]),
            "title": str(sent_df.iloc[member_idx[i]].get("Title", ""))[:150],
            "keywords": str(sent_df.iloc[member_idx[i]].get("Author Keywords", ""))[:150],
        }, nearest))

        # Count unique papers in this topic + collect their titles
        topic_papers_df = sent_df.iloc[member_idx].drop_duplicates(subset=["_paper_id"])
        unique_papers = len(topic_papers_df)
        paper_titles = list(map(
            lambda idx: str(topic_papers_df.iloc[idx].get("Title", ""))[:200],
            range(min(50, unique_papers))))  # cap at 50 titles per topic

        return {"topic_id": int(row["Topic"]),
                "sentence_count": int(row["Count"]),
                "paper_count": int(unique_papers),
                "top_words": str(row.get("Name", ""))[:100],
                "nearest": nearest_evidence,
                "paper_titles": paper_titles}

    summaries = list(map(_centroid_nearest, valid_rows))
    # NOTE(review): file handle from open(...) is never explicitly closed —
    # relies on CPython refcount GC; consider `with open(...)` here.
    json.dump(summaries, open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json", "w"), indent=2, default=str)
    debug(f">>> {len(summaries)} topics saved ({NEAREST_K} nearest sentences each)")

    # ── Format output ──
    lines = list(map(
        lambda s: f" Topic {s['topic_id']} ({s['sentence_count']} sentences, {s['paper_count']} papers): {s['top_words']}",
        summaries))
    return (f"[{run_key}] {n_topics} topics from {n_sentences} sentences ({n_papers} papers, {n_outliers} outliers).\n\n"
            + "\n".join(lines)
            + f"\n\nVisualizations: /tmp/rq4_{run_key}_*.html (4 files)"
            + f"\nCheckpoints: {CHECKPOINT_DIR}/rq4_{run_key}_emb.npy + summaries.json")
|
| 275 |
+
# ───────────────────────────────────────────────
# TOOL 3: Label Topics with Mistral (sentence evidence)
# ───────────────────────────────────────────────
@tool
def label_topics_with_llm(run_key: str) -> str:
    """Send 5 nearest centroid sentences + paper metadata to Mistral for labeling.
    Each sentence shows which paper it came from (title + keywords).

    Args:
        run_key: One of 'abstract' or 'title'.

    Returns:
        Labeled topics with sentence-level evidence."""
    debug(f"\n>>> TOOL: label_topics_with_llm(run_key='{run_key}')")
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser

    # Read Round-1 summaries written by run_bertopic_discovery (with closes the handle).
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json") as fh:
        summaries = json.load(fh)
    debug(f">>> Loaded {len(summaries)} topics ({NEAREST_K} sentences each)")

    # Limit to the MAX_LABEL_TOPICS largest topics — prevents Mistral rate limit on 2000+ topics
    MAX_LABEL_TOPICS = 100
    sorted_summaries = sorted(summaries, key=lambda s: s.get("sentence_count", 0), reverse=True)
    summaries_to_label = sorted_summaries[:MAX_LABEL_TOPICS]
    skipped = max(0, len(summaries) - MAX_LABEL_TOPICS)
    debug(f">>> Labeling top {len(summaries_to_label)} topics (skipped {skipped} small clusters)")

    # Format all topics — show sentence + paper metadata as evidence
    topics_block = "\n\n".join(list(map(
        lambda s: (f"Topic {s['topic_id']} ({s['sentence_count']} sentences from {s['paper_count']} papers):\n"
                   f" Top words: {s['top_words']}\n"
                   f" {NEAREST_K} nearest centroid sentences:\n"
                   + "\n".join(list(map(
                       lambda e: (f" - \"{e['sentence'][:200]}\"\n"
                                  f" Paper: \"{e['title']}\"\n"
                                  f" Keywords: {e['keywords']}"),
                       s["nearest"])))),
        summaries_to_label)))

    prompt = PromptTemplate.from_template(
        "You are a research topic classifier for academic papers about Technology and Tourism.\n\n"
        "For EACH topic below, you are given the 5 sentences nearest to the topic centroid,\n"
        "plus the paper title and author keywords each sentence came from.\n\n"
        "Return a JSON ARRAY with one object per topic:\n"
        "- topic_id: integer\n"
        "- label: short descriptive name (3-6 words, specific — NOT generic like 'tourism studies')\n"
        "- category: general research area (e.g., 'technology adoption', 'consumer behavior',\n"
        " 'virtual reality', 'social media marketing', 'sustainability', 'cultural heritage',\n"
        " 'AI and machine learning', 'online reviews', 'destination marketing',\n"
        " 'tourist psychology', 'hotel management', 'sharing economy',\n"
        " 'mobile applications', 'research methodology', 'data analytics')\n"
        " DO NOT use PACIS/ICIS categories — just plain descriptive research area.\n"
        "- confidence: high, medium, or low\n"
        "- reasoning: 1 sentence explaining WHY you chose this label based on the evidence sentences\n"
        "- niche: true or false (true = very specific sub-area with <20 sentences)\n\n"
        "CRITICAL: be SPECIFIC in labels. Do NOT use broad terms.\n"
        "Return ONLY valid JSON array, no markdown.\n\n"
        "Topics:\n{topics}")

    llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=300)
    chain = prompt | llm | JsonOutputParser()
    debug(">>> Calling Mistral (single call, all topics)...")
    labels = chain.invoke({"topics": topics_block})
    debug(f">>> Got {len(labels)} labels")

    # BUG FIX: merge by topic_id, not by position. `labels` corresponds to the
    # size-SORTED `summaries_to_label`, while `summaries` keeps the original
    # topic order — the old positional zip paired most labels with the wrong
    # topic (and padded with summaries themselves). Unlabeled topics keep their
    # summary fields unchanged.
    labels_by_id = dict(map(lambda l: (l.get("topic_id"), l), labels))
    labeled = list(map(
        lambda s: {**s, **labels_by_id.get(s.get("topic_id"), {})},
        summaries))
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json", "w") as fh:
        json.dump(labeled, fh, indent=2, default=str)
    debug(f">>> Labels saved: {CHECKPOINT_DIR}/rq4_{run_key}_labels.json")

    # Format — show label + evidence sentences + paper source
    lines = list(map(
        lambda l: (f" **Topic {l.get('topic_id', '?')}: {l.get('label', '?')}** "
                   f"[{l.get('category', '?')}] conf={l.get('confidence', '?')} "
                   f"({l.get('sentence_count', 0)} sentences, {l.get('paper_count', 0)} papers)\n"
                   + "\n".join(list(map(
                       lambda e: f" • \"{e['sentence'][:120]}...\" → _{e['title'][:60]}_",
                       l.get("nearest", []))))),
        labeled))
    return f"[{run_key}] {len(labeled)} topics labeled by Mistral:\n\n" + "\n\n".join(lines)
| 358 |
+
|
| 359 |
+
# ───────────────────────────────────────────────
# TOOL 4: Generate Comparison Table
# ───────────────────────────────────────────────
@tool
def generate_comparison_csv() -> str:
    """Compare Mistral-labeled topics across completed runs. Includes sentence + paper counts.

    Returns:
        Comparison table + CSV path."""
    debug(f"\n>>> TOOL: generate_comparison_csv()")
    # A run is "completed" once label_topics_with_llm has written its labels.json.
    completed = list(filter(
        lambda k: os.path.exists(f"{CHECKPOINT_DIR}/rq4_{k}_labels.json"), RUN_CONFIGS.keys()))
    debug(f">>> Completed runs: {completed}")

    def _load_run(run_key):
        """Load one run's labels.json and flatten each label into a CSV-ready row dict."""
        # FIX: `with` closes the file handle (was json.load(open(...)) — leaked).
        with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json") as fh:
            labels = json.load(fh)
        return list(map(lambda l: {
            "run": run_key, "topic_id": l.get("topic_id", ""),
            "label": l.get("label", ""), "category": l.get("category", ""),
            "confidence": l.get("confidence", ""), "niche": l.get("niche", ""),
            "sentences": l.get("sentence_count", 0),
            "papers": l.get("paper_count", 0),
            "top_words": l.get("top_words", ""),
        }, labels))

    # FIX: chain.from_iterable is O(n); sum(lists, []) re-copies on every step (O(n²)).
    from itertools import chain
    all_rows = list(chain.from_iterable(map(_load_run, completed)))
    df = pd.DataFrame(all_rows)
    path = "/tmp/rq4_comparison.csv"
    df.to_csv(path, index=False)
    debug(f">>> Comparison CSV: {path} ({len(df)} rows)")
    return f"Comparison saved: {path} ({len(completed)} runs, {len(df)} topics)\n\n{df.to_string(index=False)}"
+
|
| 392 |
+
# ───────────────────────────────────────────────
# TOOL 5: Export 500-Word Narrative
# ───────────────────────────────────────────────
@tool
def export_narrative(run_key: str) -> str:
    """Generate 500-word narrative for research paper Section 7 via Mistral.

    Args:
        run_key: One of 'abstract' or 'title'.

    Returns:
        500-word narrative + save path."""
    debug(f"\n>>> TOOL: export_narrative(run_key='{run_key}')")
    from langchain_mistralai import ChatMistralAI

    # FIX: `with` closes the handle (was json.load(open(...)) — leaked).
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json") as fh:
        labels = json.load(fh)
    topics_text = "\n".join(list(map(
        lambda l: f"- {l.get('label', '?')} ({l.get('sentence_count', 0)} sentences from "
                  f"{l.get('paper_count', 0)} papers, category: {l.get('category', '?')}, "
                  f"confidence: {l.get('confidence', '?')}, niche: {l.get('niche', '?')})",
        labels)))

    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.3, timeout=300)
    result = llm.invoke(
        f"Write exactly 500 words for a research paper Section 7 titled "
        f"'Topic Modeling Results — BERTopic Discovery'.\n\n"
        f"Dataset: 1390 Scopus papers on Tourism and AI.\n"
        f"Method: Sentence-level BERTopic — each abstract split into sentences,\n"
        f"embedded with all-MiniLM-L6-v2 (384d), clustered with AgglomerativeClustering (cosine).\n"
        f"Note: One paper can contribute sentences to MULTIPLE topics.\n"
        f"Run config: '{run_key}' columns.\n\n"
        f"Topics discovered:\n{topics_text}\n\n"
        f"Include: methodology justification for sentence-level approach,\n"
        f"key themes, emerging niches, limitations, future work.")

    path = "/tmp/rq4_narrative.txt"
    # FIX: the original `open(path, "w").write(...)` never closed the handle,
    # relying on refcount GC to flush; `with` guarantees flush + close.
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(result.content)
    debug(f">>> Narrative saved: {path} ({len(result.content)} chars)")
    return f"Narrative saved: {path}\n\n{result.content}"
+
|
| 433 |
+
# ───────────────────────────────────────────────
# TOOL 6: Consolidate Round 1 Topics into Themes
# ───────────────────────────────────────────────
@tool
def consolidate_into_themes(run_key: str, theme_map: dict) -> str:
    """ROUND 2: Merge fine-grained Round 1 topics into broader themes.
    Researcher decides which topics to group. Recomputes centroids and evidence.

    Args:
        run_key: 'abstract' or 'title'.
        theme_map: Dict mapping theme names to topic ID lists.
            Example: {"AI in Tourism": [0, 1, 5], "VR Tourism": [2, 3]}

    Returns:
        Consolidated themes with new 5-nearest sentence evidence per theme."""
    debug(f"\n>>> TOOL: consolidate_into_themes(run_key='{run_key}', {len(theme_map)} themes)")

    # Requires run_bertopic_discovery for the same run_key in THIS process
    # (these are in-memory caches, not checkpoints) — KeyError otherwise.
    topics_arr = _data[f"{run_key}_topics"]
    embeddings = _data[f"{run_key}_embeddings"]
    sent_df = _data[f"{run_key}_sent_df"]

    def _build_theme(item):
        """Merge listed topics into one theme. Recompute centroid + 5 nearest."""
        theme_name, topic_ids = item
        # Sentences whose Round-1 topic is any of this theme's topic ids.
        mask = np.isin(topics_arr, topic_ids)
        member_idx = np.where(mask)[0]
        member_embs = embeddings[mask]
        centroid = member_embs.mean(axis=0)
        # Same cosine-to-centroid math as run_bertopic_discovery's Step 7;
        # +1e-10 guards division by zero.
        norms = np.linalg.norm(member_embs, axis=1) * np.linalg.norm(centroid)
        cosine_sim = (member_embs @ centroid) / (norms + 1e-10)
        dists = 1 - cosine_sim
        nearest = np.argsort(dists)[:NEAREST_K]

        # Evidence dicts for the NEAREST_K closest sentences (values truncated).
        nearest_evidence = list(map(lambda i: {
            "sentence": str(sent_df.iloc[member_idx[i]]["text"])[:250],
            "paper_id": int(sent_df.iloc[member_idx[i]]["_paper_id"]),
            "title": str(sent_df.iloc[member_idx[i]].get("Title", ""))[:150],
            "keywords": str(sent_df.iloc[member_idx[i]].get("Author Keywords", ""))[:150],
        }, nearest))

        unique_papers = sent_df.iloc[member_idx]["_paper_id"].nunique()

        # Collect paper titles (up to 50)
        topic_papers_df = sent_df.iloc[member_idx].drop_duplicates(subset=["_paper_id"])
        paper_titles = list(map(
            lambda idx: str(topic_papers_df.iloc[idx].get("Title", ""))[:200],
            range(min(50, len(topic_papers_df)))))

        return {"label": theme_name, "merged_topics": list(topic_ids),
                "sentence_count": int(mask.sum()), "paper_count": int(unique_papers),
                "nearest": nearest_evidence, "paper_titles": paper_titles}

    # Add topic_id to each theme (sequential, in theme_map insertion order)
    themes_raw = list(map(_build_theme, theme_map.items()))
    themes = list(map(
        lambda pair: {**pair[1], "topic_id": pair[0]},
        enumerate(themes_raw)))
    # NOTE(review): file handle never explicitly closed — relies on CPython
    # refcount GC; consider `with open(...)` here.
    json.dump(themes, open(f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json", "w"), indent=2, default=str)
    debug(f">>> {len(themes)} themes saved: {CHECKPOINT_DIR}/rq4_{run_key}_themes.json")

    # Format — show theme + merged topics + evidence
    lines = list(map(
        lambda t: (f" **{t['label']}** ({t['sentence_count']} sentences, {t['paper_count']} papers)\n"
                   f" Merged from topics: {t['merged_topics']}\n"
                   f" Evidence:\n"
                   + "\n".join(list(map(
                       lambda e: f" • \"{e['sentence'][:120]}...\" → _{e['title'][:60]}_",
                       t["nearest"])))),
        themes))
    return f"[{run_key}] Round 2: {len(themes)} themes consolidated:\n\n" + "\n\n".join(lines)
| 504 |
+
|
| 505 |
+
# ───────────────────────────────────────────────
# TOOL 7: Compare Themes with PAJAIS Taxonomy
# ───────────────────────────────────────────────

# Established IS topic taxonomy from:
# Jiang, Liang & Tsai (2019) "Knowledge Profile in PAJAIS"
# Pacific Asia Journal of the AIS, 11(1), 1-24. doi:10.17705/1pais.11101
# Used verbatim in the compare_with_taxonomy prompt below.
PAJAIS_TAXONOMY = [
    "Electronic and Mobile Business / Social Commerce",
    "Human Behavior and IS / Human-Computer Interaction",
    "IS/IT Strategy, Leadership, Governance",
    "Business Intelligence and Data Analytics",
    "Design Science and IS",
    "Enterprise Systems and BPM",
    "IS Implementation, Adoption, and Diffusion",
    "Social Media and Business Impact",
    "Cultural and Global Issues in IS",
    "IS Security and Privacy",
    "IS Smart / IoT",
    "Knowledge Management",
    "ICT / Digital Platform / IT and Work",
    "IS Healthcare",
    "IT Project Management",
    "Service Science and IS",
    "Social and Organizational Aspects of IS",
    "Research Methods and Philosophy",
    "E-Finance / Economics of IS",
    "E-Government",
    "IS Education and Learning",
    "Green IT and Sustainability",
]
| 537 |
+
|
| 538 |
+
@tool
def compare_with_taxonomy(run_key: str) -> str:
    """Compare BERTopic themes against established PAJAIS/PACIS taxonomy
    (Jiang, Liang & Tsai, 2019). Identifies which themes map to known
    categories and which are NOVEL/EMERGING (not in existing taxonomy).
    Researcher reviews mapping and approves new theme consolidation.

    Args:
        run_key: 'abstract' or 'title'.

    Returns:
        Mapping table: BERTopic theme → PAJAIS category (or NOVEL)."""
    debug(f"\n>>> TOOL: compare_with_taxonomy(run_key='{run_key}')")
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser

    # Load themes (prefer consolidated themes, fall back to labels).
    # `(cond and a) or b` is the module's no-if selection idiom.
    themes_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json"
    labels_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json"
    source_path = (os.path.exists(themes_path) and themes_path) or labels_path
    # FIX: `with` closes the handle (was json.load(open(...)) — leaked).
    with open(source_path) as fh:
        themes = json.load(fh)
    debug(f">>> Loaded {len(themes)} themes from {source_path}")

    # Format themes for Mistral
    themes_text = "\n".join(list(map(
        lambda t: f"- {t.get('label', '?')} "
                  f"({t.get('paper_count', t.get('count', '?'))} papers)",
        themes)))

    taxonomy_text = "\n".join(list(map(lambda c: f"- {c}", PAJAIS_TAXONOMY)))

    prompt = PromptTemplate.from_template(
        "You are an IS research taxonomy expert.\n\n"
        "Compare each BERTopic theme against the established PAJAIS/PACIS taxonomy.\n"
        "For EACH theme, return a JSON ARRAY with:\n"
        "- label: the BERTopic theme name\n"
        "- pajais_match: closest PAJAIS category (or 'NOVEL' if no match)\n"
        "- match_confidence: high, medium, low, or none\n"
        "- reasoning: why this mapping (1 sentence)\n"
        "- is_novel: true if this theme represents an emerging area not in the taxonomy\n\n"
        "Return ONLY valid JSON array.\n\n"
        "BERTopic Themes:\n{themes}\n\n"
        "PAJAIS Taxonomy (Jiang et al., 2019):\n{taxonomy}")

    llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=300)
    chain = prompt | llm | JsonOutputParser()
    debug(">>> Calling Mistral for taxonomy comparison...")
    mappings = chain.invoke({"themes": themes_text, "taxonomy": taxonomy_text})
    debug(f">>> Got {len(mappings)} mappings")

    # Save mapping (FIX: with-block closes the handle; was json.dump(..., open(...))).
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json", "w") as fh:
        json.dump(mappings, fh, indent=2, default=str)

    # Count novel vs mapped
    novel = list(filter(lambda m: m.get("is_novel", False), mappings))
    mapped = list(filter(lambda m: not m.get("is_novel", False), mappings))

    # Format output
    mapped_lines = list(map(
        lambda m: f" ✅ {m.get('label', '?')} → **{m.get('pajais_match', '?')}** "
                  f"(conf={m.get('match_confidence', '?')}) _{m.get('reasoning', '')}_",
        mapped))
    novel_lines = list(map(
        lambda m: f" 🆕 **{m.get('label', '?')}** → NOVEL "
                  f"_{m.get('reasoning', '')}_",
        novel))

    return (f"[{run_key}] Taxonomy comparison (Jiang et al., 2019):\n\n"
            f"**Mapped to PAJAIS categories ({len(mapped)}):**\n" + "\n".join(mapped_lines) +
            f"\n\n**NOVEL / Emerging themes ({len(novel)}):**\n" + "\n".join(novel_lines) +
            f"\n\nSaved: {CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json")
|
| 612 |
+
# ───────────────────────────────────────────────
# GET ALL TOOLS
# ───────────────────────────────────────────────
def get_all_tools():
    """Return all 7 tools with error handling enabled.

    Sets handle_tool_error=True on each @tool so the agent receives tool
    exceptions as messages instead of crashing.
    """
    registry = [load_scopus_csv, run_bertopic_discovery, label_topics_with_llm,
                consolidate_into_themes, compare_with_taxonomy,
                generate_comparison_csv, export_narrative]
    _enable = lambda t: setattr(t, 'handle_tool_error', True)
    list(map(_enable, registry))
    debug(f">>> tools.py: {len(registry)} tools ready (handle_tool_error=True)")
    list(map(lambda t: debug(f">>> - {t.name}"), registry))
    return registry
|