aadisawant2912 committed on
Commit
bccf63d
·
verified ·
1 Parent(s): ba97cd6

Create tools_v2.py

Browse files
Files changed (1) hide show
  1. tools_v2.py +642 -0
tools_v2.py ADDED
@@ -0,0 +1,642 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ tools_v2.py - SPECTER2 + HDBSCAN + UMAP thematic analysis tools.
3
+ NEW in v2:
4
+ - Combined Title+Abstract text per paper (with DOI)
5
+ - SPECTER2 document-level embeddings (allenai/specter2_base)
6
+ - UMAP dimensionality reduction
7
+ - HDBSCAN density-based clustering (min 5, max 120 papers per cluster)
8
+ - Cosine similarity threshold 0.50-0.60
9
+ - Target 15-30 clusters (manageable for journal discussion)
10
+ - Council-of-3-LLMs labeling (Mistral + two prompt variants) → mode vote
11
+ - Rich audit CSV: cluster assignments, 3 LLM decisions, final label,
12
+ top sentences, source paper titles
13
+ RULES: ZERO if/else, ZERO for/while, ZERO try/except, ZERO PromptTemplate.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import re
20
+ from pathlib import Path
21
+
22
+ import numpy as np
23
+ import pandas as pd
24
+ import plotly.express as px
25
+ from langchain_core.tools import tool
26
+ from langchain_core.messages import HumanMessage
27
+ from langchain_mistralai import ChatMistralAI
28
+
29
# Root folder for every artefact written by this module; created at import time.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Target category list used when mapping clusters to the PAJAIS taxonomy.
PAJAIS_CATEGORIES = [
    "Information Systems Theory",
    "IS Strategy & Governance",
    "Digital Innovation",
    "Enterprise Systems",
    "AI & Intelligent Systems",
    "Big Data & Analytics",
    "Cybersecurity & Privacy",
    "Cloud Computing",
    "IS in Healthcare",
    "IS in Education",
    "E-Commerce & Digital Markets",
    "Social Media & Platforms",
    "Human-Computer Interaction",
    "IS Project Management",
    "IT Outsourcing",
    "Knowledge Management",
    "IS Development Methodologies",
    "Digital Transformation",
    "IS Ethics & Society",
    "IS in Developing Countries",
    "Mobile Computing",
    "IT Infrastructure",
    "IS Adoption & Diffusion",
    "IS Evaluation",
    "Organizational IS & Change",
]

# ── lazy-load heavy models ─────────────────────────────────────────────────────
# Module-level slots; each starts empty and is assigned by its _get_* helper.
_SPECTER_MODEL = None
_UMAP_MODULE = None
_HDBSCAN_MODULE = None
52
+
53
def _load_specter():
    """Build the SPECTER2 ``(tokenizer, model)`` pair with ``from_pretrained``."""
    from transformers import AutoTokenizer, AutoModel

    # Use base specter2 which does not need adapters
    model_id = "allenai/specter2_base"
    print("Loading SPECTER2 model (first call)...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id)
    model.eval()
    print("SPECTER2 loaded.")
    return tokenizer, model


def _get_specter():
    """Return the shared SPECTER2 ``(tokenizer, model)`` pair, loading it once.

    The pair is cached in the module-level ``_SPECTER_MODEL`` slot. Earlier
    versions overwrote that slot on every call, so each embedding request
    re-instantiated the tokenizer and model; now the cached pair is reused.

    Returns:
        tuple: ``(tokenizer, model)`` as produced by ``_load_specter``.
    """
    global _SPECTER_MODEL
    # A loaded pair is a non-empty tuple (truthy), so `or` only triggers the
    # expensive load while the slot still holds its initial None.
    _SPECTER_MODEL = _SPECTER_MODEL or _load_specter()
    return _SPECTER_MODEL
67
+
68
def _embed_specter(texts: list[str]) -> np.ndarray:
    """Encode ``texts`` with the SPECTER2 model and stack the results.

    Texts are processed eight at a time. For every text the hidden state at
    position 0 is taken as its embedding and scaled to unit L2 norm.

    Args:
        texts: Strings to embed.

    Returns:
        One row per input text, in input order.
    """
    import torch

    tokenizer, model = _get_specter()
    batch_size = 8
    chunks = []
    for offset in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[offset: offset + batch_size],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        with torch.no_grad():
            output = model(**encoded)
        # CLS token embedding
        cls_vectors = output.last_hidden_state[:, 0, :].numpy()
        # L2 normalize (the floor avoids dividing by a zero-length vector)
        lengths = np.linalg.norm(cls_vectors, axis=1, keepdims=True)
        chunks.append(cls_vectors / np.maximum(lengths, 1e-9))
    return np.vstack(chunks)
89
+
90
def _get_umap():
    """Import ``umap`` on first use, record it in ``_UMAP_MODULE`` and return it."""
    global _UMAP_MODULE
    import umap

    _UMAP_MODULE = umap
    return umap
95
+
96
def _get_hdbscan():
    """Import ``hdbscan`` on first use, record it in ``_HDBSCAN_MODULE`` and return it."""
    global _HDBSCAN_MODULE
    import hdbscan

    _HDBSCAN_MODULE = hdbscan
    return hdbscan
101
+
102
+
103
def _p2() -> dict:
    """Return every file path used by a v2 run, creating ``DATA_DIR/v2`` if needed.

    Returns:
        dict: ``"dir"`` is the v2 folder itself, ``"comparison"`` lives
        directly under ``DATA_DIR``, and every other key is a file inside
        the v2 folder.
    """
    base = DATA_DIR / "v2"
    base.mkdir(parents=True, exist_ok=True)

    # key -> file name inside the v2 folder
    files_in_base = {
        "papers": "papers.json",
        "embeddings": "embeddings.npy",
        "umap_emb": "umap_emb.npy",
        "clusters": "clusters.json",
        "summaries": "summaries.json",
        "taxonomy": "taxonomy.json",
        "charts": "charts.json",
        "audit_csv": "cluster_audit.csv",
        "narrative": "narrative_v2.txt",
    }

    paths = {"dir": base}
    paths.update({key: base / filename for key, filename in files_in_base.items()})
    paths["comparison"] = DATA_DIR / "comparison_v2.csv"
    return paths
120
+
121
+
122
def safe_read_csv(path):
    """Read a CSV as UTF-8, retrying as Latin-1 when the bytes are not valid UTF-8.

    Args:
        path: Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed file.
    """
    try:
        frame = pd.read_csv(path, encoding="utf-8")
    except UnicodeDecodeError:
        # Fall back to an encoding that accepts every byte value.
        frame = pd.read_csv(path, encoding="latin-1")
    return frame
127
+
128
+
129
def _call_llm_json(llm, prompt: str):
    """Send ``prompt`` to ``llm`` and parse the reply as JSON.

    The reply may be bare JSON or JSON wrapped in a Markdown code fence.
    Fences are accepted with or without a ``json`` tag (any letter case) and
    with or without a closing fence; text before the opening fence is ignored.
    The previous split-based stripping returned an empty string for a fence
    that had no ``json`` tag, which made ``json.loads`` fail on valid replies.

    Args:
        llm: Object exposing ``invoke(messages)`` whose result has a string
            ``content`` attribute.
        prompt: Text sent as a single human message.

    Returns:
        The decoded JSON value (whatever ``json.loads`` produces).

    Raises:
        json.JSONDecodeError: If the extracted text is not valid JSON.
    """
    response = llm.invoke([HumanMessage(content=prompt)])
    raw = response.content.strip()
    # Capture what sits between the opening fence (optional "json" tag) and
    # either the closing fence or the end of the reply.
    fenced = re.search(
        r"```(?:json)?\s*(.*?)\s*(?:```|$)",
        raw,
        flags=re.DOTALL | re.IGNORECASE,
    )
    payload = fenced.group(1) if fenced else raw
    return json.loads(payload.strip())
135
+
136
+
137
def _mode_label(labels: list[str]) -> str:
    """Pick the label that occurs most often; on a tie the earliest one wins.

    Args:
        labels: Candidate labels; must contain at least one entry.

    Returns:
        The most frequent label.

    Raises:
        IndexError: If ``labels`` is empty.
    """
    from collections import Counter

    ranked = Counter(labels).most_common(1)
    winner, _occurrences = ranked[0]
    return winner
143
+
144
+ # =============================================================================
145
+ # V2 TOOL 1 — load_and_embed_specter2
146
+ # =============================================================================
147
@tool
def load_and_embed_specter2(csv_path: str = "data/uploaded.csv") -> str:
    """Load Scopus CSV, build combined Title+Abstract text per paper, embed with SPECTER2.
    Saves papers metadata + embeddings to data/v2/.
    Args:
        csv_path: Path to uploaded Scopus CSV.
    """
    # The docstring above is left untouched on purpose (it is the text the
    # @tool decorator sees); implementation notes live in comments instead.
    #
    # Returns a JSON string with total_papers, valid_papers, embedding_dim and
    # note. Raises ValueError when no row has more than five words of
    # title+abstract text, because there is nothing to embed in that case.
    p = _p2()
    df = safe_read_csv(csv_path)
    n_rows = len(df)

    # Exact (case-insensitive, stripped) header name -> real header.
    exact = {c.strip().lower(): c for c in df.columns}

    def find_column(needle, allow_exact=True):
        """Prefer a header equal to ``needle``; else the first one containing it."""
        fuzzy = next((c for c in df.columns if needle in c.lower()), None)
        return exact.get(needle, fuzzy) if allow_exact else fuzzy

    def column_values(col):
        """Cell values with blanks as "", or all-blank when the column is absent."""
        return list(df[col].fillna("")) if col else [""] * n_rows

    def year_text(value) -> str:
        """Render a year cell as text without a spurious ".0" suffix."""
        # A year column with blank cells is read as float, so 2020 arrives as
        # 2020.0; whole floats are rendered as integers.
        is_whole_float = isinstance(value, float) and value.is_integer()
        return str(int(value)) if is_whole_float else str(value).strip()

    # Text fields are coerced to str so they can always be JSON-serialised and
    # sliced by the later tools, even when pandas parsed a column as numeric.
    titles = [str(v) for v in column_values(find_column("title"))]
    abstracts = [str(v) for v in column_values(find_column("abstract"))]
    dois = [str(v) for v in column_values(find_column("doi"))]
    years = [year_text(v) for v in column_values(find_column("year"))]
    # The journal column is located by substring only ("source").
    journals = [str(v) for v in column_values(find_column("source", allow_exact=False))]

    combined_texts = [
        "{} {}".format(t.strip(), a.strip()).strip()
        for t, a in zip(titles, abstracts)
    ]

    # Keep only rows whose combined text has more than five words.
    papers = [
        {
            "paper_idx": idx,
            "title": title,
            "abstract": abstract,
            "doi": doi,
            "year": year,
            "journal": journal,
            "combined": combined,
        }
        for idx, (title, abstract, doi, year, journal, combined) in enumerate(
            zip(titles, abstracts, dois, years, journals, combined_texts)
        )
        if len(combined.split()) > 5
    ]

    if not papers:
        # Fail before the model is loaded, with a message naming the cause.
        raise ValueError(
            "No rows in {!r} have more than 5 words of title+abstract text; "
            "nothing to embed.".format(csv_path)
        )

    p["papers"].write_text(json.dumps(papers, indent=2, ensure_ascii=False))

    valid_texts = [paper["combined"] for paper in papers]
    print("Embedding {} papers with SPECTER2...".format(len(valid_texts)))
    embs = _embed_specter(valid_texts)
    np.save(p["embeddings"], embs)

    return json.dumps({
        "total_papers": n_rows,
        "valid_papers": len(papers),
        "embedding_dim": int(embs.shape[1]),
        "note": "Combined Title+Abstract embedded with SPECTER2. Ready for UMAP+HDBSCAN.",
    })
205
+
206
+
207
+ # =============================================================================
208
+ # V2 TOOL 2 — cluster_with_umap_hdbscan
209
+ # =============================================================================
210
@tool
def cluster_with_umap_hdbscan(
    umap_neighbors: int = 15,
    umap_min_dist: float = 0.05,
    hdbscan_min_cluster_size: int = 5,
    hdbscan_min_samples: int = 3,
) -> str:
    """Reduce SPECTER2 embeddings with UMAP then cluster with HDBSCAN.
    Targets 15-30 clusters, each containing 5-120 papers.
    Cosine metric throughout. Saves cluster assignments to data/v2/clusters.json.
    Args:
        umap_neighbors: UMAP n_neighbors (default 15).
        umap_min_dist: UMAP min_dist (default 0.05 for tighter clusters).
        hdbscan_min_cluster_size: Minimum papers per cluster (default 5).
        hdbscan_min_samples: HDBSCAN min_samples for core points (default 3).
    """
    # The docstring above is left untouched on purpose (it is the text the
    # @tool decorator sees); implementation notes live in comments instead.
    #
    # Reads embeddings.npy + papers.json, writes umap_emb.npy, charts.json and
    # clusters.json, and returns a JSON summary string.
    p = _p2()
    embs = np.load(p["embeddings"])
    papers = json.loads(p["papers"].read_text())

    umap_mod = _get_umap()
    hdbscan_mod = _get_hdbscan()

    print("Running UMAP (n={}, min_dist={})...".format(umap_neighbors, umap_min_dist))
    reducer = umap_mod.UMAP(
        n_components=5,
        n_neighbors=umap_neighbors,
        min_dist=umap_min_dist,
        metric="cosine",
        random_state=42,
        verbose=False,
    )
    umap_embs = reducer.fit_transform(embs)
    np.save(p["umap_emb"], umap_embs)

    print("Running HDBSCAN (min_cluster={})...".format(hdbscan_min_cluster_size))
    clusterer = hdbscan_mod.HDBSCAN(
        min_cluster_size=hdbscan_min_cluster_size,
        min_samples=hdbscan_min_samples,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )
    labels = clusterer.fit_predict(umap_embs)
    probs = clusterer.probabilities_

    # Label -1 is treated as noise and never becomes a cluster.
    unique_clusters = sorted(set(labels.tolist()) - {-1})
    n_clusters = len(unique_clusters)

    print("HDBSCAN found {} clusters (excl. noise)".format(n_clusters))

    def build_cluster_record(cid):
        """Collect papers, similarities and exemplars for raw cluster ``cid``."""
        mask = labels == cid
        indices = np.flatnonzero(mask).tolist()
        cluster_papers = [papers[i] for i in indices]
        cluster_embs = embs[mask]
        centroid = cluster_embs.mean(axis=0)
        # Cosine similarity of each paper to centroid
        norms = np.linalg.norm(cluster_embs, axis=1, keepdims=True)
        normed = cluster_embs / np.maximum(norms, 1e-9)
        c_norm = centroid / max(np.linalg.norm(centroid), 1e-9)
        sims = (normed @ c_norm).tolist()
        # Top 3 papers closest to centroid (positions within this cluster)
        top3_idx = sorted(range(len(sims)), key=lambda x: -sims[x])[:3]
        return {
            "cluster_id": cid,
            "paper_count": int(mask.sum()),
            "papers": cluster_papers,
            "paper_indices": indices,
            "hdbscan_probs": probs[mask].tolist(),
            "centroid_sims": sims,
            "centroid": centroid.tolist(),
            "top3_paper_idx": top3_idx,
            # str() guards the slices against non-string title/abstract values.
            "top3_titles": [str(cluster_papers[i]["title"]) for i in top3_idx],
            "top3_abstracts": [str(cluster_papers[i]["abstract"])[:200] for i in top3_idx],
        }

    all_clusters_raw = [build_cluster_record(cid) for cid in unique_clusters]

    # Keep clusters with 5-120 papers, largest first. No further relaxation is
    # applied if the count ends up outside the 15-30 target.
    kept = sorted(
        (c for c in all_clusters_raw if 5 <= c["paper_count"] <= 120),
        key=lambda c: -c["paper_count"],
    )

    # Raw HDBSCAN id -> sequential id 1..N used by every downstream artefact.
    final_id_of = {c["cluster_id"]: seq for seq, c in enumerate(kept, start=1)}
    valid_clusters = [{**c, "cluster_id": final_id_of[c["cluster_id"]]} for c in kept]

    noise_count = int((labels == -1).sum())

    # Build 2D UMAP for scatter chart
    reducer_2d = umap_mod.UMAP(
        n_components=2,
        n_neighbors=umap_neighbors,
        min_dist=umap_min_dist,
        metric="cosine",
        random_state=42,
        verbose=False,
    )
    umap_2d = reducer_2d.fit_transform(embs)

    def chart_label(raw_id):
        """Legend entry for a point: final cluster id, "-1" noise, or "filtered"."""
        # Previously the raw HDBSCAN id was shown here, which did not match
        # the renumbered ids in clusters.json and the bar chart. Points whose
        # cluster was dropped by the size filter are marked "filtered".
        return "-1" if raw_id == -1 else str(final_id_of.get(raw_id, "filtered"))

    chart_df = pd.DataFrame({
        "x": umap_2d[:, 0].tolist(),
        "y": umap_2d[:, 1].tolist(),
        "cluster": [chart_label(raw_id) for raw_id in labels.tolist()],
        "title": [str(pp["title"])[:50] for pp in papers],
        "prob": probs.tolist(),
    })
    fig = px.scatter(
        chart_df, x="x", y="y", color="cluster",
        hover_data=["title", "prob"],
        title="UMAP + HDBSCAN Clusters ({} clusters, {} noise)".format(
            len(valid_clusters), noise_count
        ),
        labels={"x": "UMAP-1", "y": "UMAP-2"},
    )
    fig_bar = px.bar(
        x=["C{}".format(c["cluster_id"]) for c in valid_clusters],
        y=[c["paper_count"] for c in valid_clusters],
        title="Papers per Cluster",
        labels={"x": "Cluster", "y": "Papers"},
    )
    charts = {
        "scatter": fig.to_html(full_html=False, include_plotlyjs="cdn"),
        "bar": fig_bar.to_html(full_html=False, include_plotlyjs=False),
    }
    p["charts"].write_text(json.dumps(charts))

    p["clusters"].write_text(json.dumps(valid_clusters, indent=2, ensure_ascii=False))

    return json.dumps({
        "clusters_found": len(valid_clusters),
        "noise_papers": noise_count,
        "total_papers": len(papers),
        "cluster_sizes": [c["paper_count"] for c in valid_clusters],
        "note": "Clusters 1..{}, 5-120 papers each. Ready for council-of-3 labeling.".format(
            len(valid_clusters)
        ),
    })
358
+
359
+
360
+ # =============================================================================
361
+ # V2 TOOL 3 — label_clusters_council_of_3
362
+ # Council of 3 LLMs: Mistral-small × 3 with distinct expert personas/prompts
363
+ # Mode vote on final label.
364
+ # =============================================================================
365
@tool
def label_clusters_council_of_3(batch_size: int = 5) -> str:
    """Label clusters using council-of-3 LLMs (3 Mistral calls with distinct personas).
    Uses top-3 sentences closest to centroid per cluster.
    Final label = mode of 3 LLM responses.
    Saves enriched summaries + audit CSV to data/v2/.
    Args:
        batch_size: Clusters per LLM call (default 5).
    """
    # The docstring above is left untouched on purpose (it is the text the
    # @tool decorator sees); implementation notes live in comments instead.
    #
    # Reads clusters.json, writes summaries.json and cluster_audit.csv, and
    # returns a JSON summary string. A reply that cannot be parsed no longer
    # aborts the run: the affected clusters simply get no vote from that
    # persona and fall back to the "Cluster N" placeholder below.
    import time

    p = _p2()
    clusters = json.loads(p["clusters"].read_text())
    # range() rejects a zero step and a negative step would yield no batches.
    batch_size = max(1, int(batch_size))

    # Three distinct expert personas for council voting
    PERSONAS = [
        {
            "name": "IS_THEORY",
            "system": (
                "You are an Information Systems theory expert with 20 years of "
                "systematic literature review experience. You label research clusters "
                "using precise academic IS terminology. Your labels are 4-7 words, "
                "noun-phrase style, highly specific to IS sub-domains."
            ),
        },
        {
            "name": "DIGITAL_MGT",
            "system": (
                "You are a digital management and organizational behavior scholar "
                "specializing in technology adoption and digital transformation. "
                "You label research clusters with managerial and practical framing. "
                "Your labels are 4-7 words, action-oriented yet academic."
            ),
        },
        {
            "name": "COMP_SCI",
            "system": (
                "You are a computer science and AI researcher reviewing IS literature. "
                "You label research clusters from a technical and systems perspective. "
                "Your labels are 4-7 words, technically precise and domain-specific."
            ),
        },
    ]

    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.2)

    def make_prompt(persona_system: str, batch_clusters: list) -> str:
        """Compose the labeling prompt for one persona and one batch."""
        mini = [
            {
                "cluster_id": c["cluster_id"],
                "paper_count": c["paper_count"],
                "top3_titles": c["top3_titles"],
                "top3_abstracts": c["top3_abstracts"],
            }
            for c in batch_clusters
        ]
        batch_ids = [c["cluster_id"] for c in batch_clusters]
        return "".join([
            persona_system, "\n\n",
            "Label each research cluster below with a precise 4-7 word academic theme name.\n",
            "The cluster_id values in this batch are: ",
            str(batch_ids), "\n\n",
            "CLUSTERS:\n", json.dumps(mini, indent=2), "\n\n",
            "Return ONLY a raw JSON array. Each element must have exactly:\n",
            " cluster_id (integer), label (string 4-7 words), confidence (High/Medium/Low), ",
            "reasoning (one sentence).\n",
            "No markdown, no explanation.",
        ])

    def ask_for_labels(prompt: str) -> list:
        """Query the LLM and return only the dict items of its reply."""
        try:
            payload = _call_llm_json(llm, prompt)
        except json.JSONDecodeError as err:
            print("Skipping unparseable LLM response: {}".format(err))
            return []
        # A single object instead of an array is treated as a one-item array.
        items = payload if isinstance(payload, list) else [payload]
        return [item for item in items if isinstance(item, dict)]

    def parse_cluster_id(item: dict):
        """Return the item's cluster_id as int, or None when it is unusable."""
        try:
            return int(item.get("cluster_id"))
        except (TypeError, ValueError):
            return None

    batch_starts = list(range(0, len(clusters), batch_size))
    last_batch = len(batch_starts) - 1

    # One dict per persona: {cluster_id: {label, confidence, reasoning, ...}}
    persona_results = []
    for p_idx, persona in enumerate(PERSONAS):
        votes_by_cluster = {}
        for b_idx, start in enumerate(batch_starts):
            batch = clusters[start: start + batch_size]
            for item in ask_for_labels(make_prompt(persona["system"], batch)):
                cid = parse_cluster_id(item)
                if cid is not None:
                    votes_by_cluster[cid] = item
            # Pause between consecutive LLM calls (not after the last batch).
            if b_idx < last_batch:
                time.sleep(10)
        persona_results.append(votes_by_cluster)
        # Longer pause between personas (not after the last one).
        if p_idx < len(PERSONAS) - 1:
            time.sleep(15)

    def persona_field(persona_pos: int, cid, field: str):
        """Value of ``field`` in a persona's reply for ``cid`` ("" if absent)."""
        return persona_results[persona_pos].get(cid, {}).get(field, "")

    # Council vote: mode of 3 labels per cluster
    def enrich_cluster(cluster):
        cid = cluster["cluster_id"]
        placeholder = "Cluster {}".format(cid)
        votes = [str(persona_field(i, cid, "label")).strip() for i in range(3)]
        # Missing or null-like votes count as the placeholder label.
        votes_clean = [
            v if v and v.lower() not in ("", "none", "null") else placeholder
            for v in votes
        ]
        final_label = _mode_label(votes_clean)
        if len(set(votes_clean)) == 1:
            agreement = "unanimous"
        elif votes_clean.count(final_label) >= 2:
            agreement = "majority"
        else:
            agreement = "split"
        return {
            **cluster,
            "label": final_label,
            "llm_vote_1_IS_THEORY": persona_field(0, cid, "label"),
            "llm_vote_2_DIGITAL_MGT": persona_field(1, cid, "label"),
            "llm_vote_3_COMP_SCI": persona_field(2, cid, "label"),
            "confidence_1": persona_field(0, cid, "confidence"),
            "confidence_2": persona_field(1, cid, "confidence"),
            "confidence_3": persona_field(2, cid, "confidence"),
            "reasoning_1": persona_field(0, cid, "reasoning"),
            "reasoning_2": persona_field(1, cid, "reasoning"),
            "reasoning_3": persona_field(2, cid, "reasoning"),
            "vote_agreement": agreement,
        }

    enriched = [enrich_cluster(cluster) for cluster in clusters]
    p["summaries"].write_text(json.dumps(enriched, indent=2, ensure_ascii=False))

    def value_at(values: list, position: int) -> float:
        """``values[position]`` as float, or 0.0 when the list is too short."""
        return float(values[position]) if position < len(values) else 0.0

    # ── Build audit CSV ────────────────────────────────────────────────────────
    # One row per paper-in-cluster
    audit_rows = []
    for cluster in enriched:
        top3_positions = set(cluster["top3_paper_idx"])
        for paper_local_idx, paper in enumerate(cluster["papers"]):
            audit_rows.append({
                "cluster_id": cluster["cluster_id"],
                "final_label": cluster["label"],
                "vote_agreement": cluster["vote_agreement"],
                "llm1_label_IS_THEORY": cluster["llm_vote_1_IS_THEORY"],
                "llm2_label_DIGITAL_MGT": cluster["llm_vote_2_DIGITAL_MGT"],
                "llm3_label_COMP_SCI": cluster["llm_vote_3_COMP_SCI"],
                "llm1_confidence": cluster["confidence_1"],
                "llm2_confidence": cluster["confidence_2"],
                "llm3_confidence": cluster["confidence_3"],
                "llm1_reasoning": cluster["reasoning_1"],
                "llm2_reasoning": cluster["reasoning_2"],
                "llm3_reasoning": cluster["reasoning_3"],
                "paper_doi": paper.get("doi", ""),
                "paper_title": paper.get("title", ""),
                "paper_year": paper.get("year", ""),
                "paper_journal": paper.get("journal", ""),
                # str() guards the slices against non-string values.
                "paper_abstract": str(paper.get("abstract", ""))[:300],
                "combined_text": str(paper.get("combined", ""))[:200],
                "centroid_similarity": round(
                    value_at(cluster["centroid_sims"], paper_local_idx), 4
                ),
                "hdbscan_probability": round(
                    value_at(cluster["hdbscan_probs"], paper_local_idx), 4
                ),
                "is_top3_centroid": "YES" if paper_local_idx in top3_positions else "no",
            })

    audit_df = pd.DataFrame(audit_rows)
    p["audit_csv"].parent.mkdir(parents=True, exist_ok=True)
    audit_df.to_csv(p["audit_csv"], index=False, encoding="utf-8-sig")

    unanimous_count = sum(1 for c in enriched if c["vote_agreement"] == "unanimous")
    majority_count = sum(1 for c in enriched if c["vote_agreement"] == "majority")

    return json.dumps({
        "clusters_labeled": len(enriched),
        "unanimous_votes": unanimous_count,
        "majority_votes": majority_count,
        "split_votes": len(enriched) - unanimous_count - majority_count,
        "audit_csv_path": str(p["audit_csv"]),
        "audit_csv_rows": len(audit_rows),
        "note": "Council-of-3 complete. Audit CSV ready for download.",
    })
528
+
529
+
530
+ # =============================================================================
531
+ # V2 TOOL 4 — map_clusters_to_pajais_v2
532
+ # =============================================================================
533
@tool
def map_clusters_to_pajais_v2() -> str:
    """Map v2 clusters to PAJAIS 25 categories via Mistral LLM.
    Saves taxonomy to data/v2/taxonomy.json.
    """
    # Reads summaries.json, asks the LLM in chunks of ten clusters, and writes
    # the concatenated replies to taxonomy.json.
    import time

    paths = _p2()
    summaries = json.loads(paths["summaries"].read_text())
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.1)

    # Compact view of each cluster: its label plus two sample titles.
    theme_mini = [
        {
            "name": summary["label"],
            "sample": summary["top3_titles"][:2],
            "cluster_id": summary["cluster_id"],
        }
        for summary in summaries
    ]

    chunk_size = 10
    offsets = range(0, len(theme_mini), chunk_size)
    final_position = len(offsets) - 1

    mapped = []
    for position, offset in enumerate(offsets):
        chunk = theme_mini[offset: offset + chunk_size]
        prompt = "".join([
            "Map each IS research cluster to the single most relevant PAJAIS category.\n\n",
            "CLUSTERS:\n", json.dumps(chunk, indent=2), "\n\n",
            "PAJAIS CATEGORIES:\n", json.dumps(PAJAIS_CATEGORIES, indent=2), "\n\n",
            "Return ONLY a raw JSON array. Each element: ",
            "cluster_id, name, pajais_category, confidence, rationale. ",
            "No markdown, no explanation.",
        ])
        mapped.extend(_call_llm_json(llm, prompt))
        # Pause between consecutive LLM calls, but not after the last chunk.
        if position < final_position:
            time.sleep(10)

    paths["taxonomy"].write_text(json.dumps(mapped, indent=2, ensure_ascii=False))
    return json.dumps({
        "mapped_clusters": len(mapped),
        "note": "PAJAIS taxonomy saved to data/v2/taxonomy.json",
    })
574
+
575
+
576
+ # =============================================================================
577
+ # V2 TOOL 5 — export_v2_outputs
578
+ # Generates comparison_v2.csv and narrative_v2.txt
579
+ # =============================================================================
580
@tool
def export_v2_outputs() -> str:
    """Generate final comparison CSV and narrative for v2 SPECTER2 run.
    comparison_v2.csv: one row per paper with cluster, label, PAJAIS, DOI, etc.
    narrative_v2.txt: 500-word Section 7 discussion.
    """
    # The docstring above is left untouched on purpose (it is the text the
    # @tool decorator sees); implementation notes live in comments instead.
    #
    # Reads summaries.json, taxonomy.json and cluster_audit.csv; writes
    # comparison_v2.csv and narrative_v2.txt; returns a JSON summary string.
    p = _p2()
    summaries = json.loads(p["summaries"].read_text())
    taxonomy = json.loads(p["taxonomy"].read_text())

    def cluster_key(value) -> str:
        """Canonical text form of a cluster id, so 3, "3" and 3.0 all match."""
        try:
            return str(int(float(value)))
        except (TypeError, ValueError, OverflowError):
            return str(value)

    # cluster id -> PAJAIS category. Keys are normalised because the ids come
    # back from an LLM and may be numbers, numeric strings or floats.
    tax_map = {
        cluster_key(item.get("cluster_id", "")): item.get("pajais_category", "")
        for item in taxonomy
    }

    # Build comparison CSV from audit_csv (already per-paper)
    audit_df = pd.read_csv(p["audit_csv"], encoding="utf-8-sig")
    # Add PAJAIS column; clusters absent from the taxonomy become "Unknown".
    audit_df["pajais_category"] = [
        tax_map.get(cluster_key(cid), "Unknown") for cid in audit_df["cluster_id"]
    ]
    out_path = p["comparison"]
    audit_df.to_csv(out_path, index=False, encoding="utf-8-sig")

    # Narrative
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.4)
    cluster_summary = [
        {
            "cluster": s["cluster_id"],
            "label": s["label"],
            "papers": s["paper_count"],
            "agreement": s["vote_agreement"],
        }
        for s in summaries
    ]

    prompt = (
        "You are an academic writing expert in Information Systems.\n\n"
        "Write Section 7 (Discussion and Thematic Synthesis) for a systematic "
        "literature review. ~500 words, formal academic prose.\n"
        "The analysis used SPECTER2 embeddings + HDBSCAN clustering.\n"
        "Cover: (a) Overview of clusters/themes found, (b) dominant PAJAIS categories, "
        "(c) inter-cluster relationships, (d) implications for IS research, "
        "(e) methodological contribution of SPECTER2+HDBSCAN vs. traditional BERTopic, "
        "(f) limitations.\n\n"
        "CLUSTERS:\n" + json.dumps(cluster_summary, indent=2) + "\n\n"
        # Only the first 15 taxonomy entries are included in the prompt.
        "PAJAIS MAPPING:\n" + json.dumps(taxonomy[:15], indent=2) + "\n\n"
        "Write in continuous academic paragraphs. No bullet points or headers."
    )
    response = llm.invoke([HumanMessage(content=prompt)])
    narrative = response.content
    p["narrative"].write_text(narrative, encoding="utf-8")

    return json.dumps({
        "comparison_csv_rows": len(audit_df),
        "comparison_csv_path": str(out_path),
        "narrative_words": len(narrative.split()),
        "narrative_path": str(p["narrative"]),
        "clusters_in_csv": len(summaries),
        "note": "All v2 outputs ready in data/v2/ and data/comparison_v2.csv",
    })