aadisawant2912 committed on
Commit
f083a2e
·
verified ·
1 Parent(s): a19ce45

Update tools_v2.py

Browse files
Files changed (1) hide show
  1. tools_v2.py +132 -464
tools_v2.py CHANGED
@@ -1,399 +1,157 @@
1
- """
2
- tools_v2.py - SPECTER2 + HDBSCAN + UMAP thematic analysis tools.
3
- COMPLETELY INDEPENDENT from tools.py (v1). No shared state, no ordering dependency.
4
- V2 can be run before, after, or without ever running V1.
5
-
6
- SPECTER2 is allenai/specter2_base — a local HuggingFace model.
7
- NO API KEY required. Downloads once, cached automatically.
8
-
9
- Pipeline:
10
- 1. Combined Title+Abstract per paper → SPECTER2 embedding (768-dim)
11
- 2. UMAP (cosine, 5D) → tight document clusters
12
- 3. HDBSCAN → 15-30 clusters, 5-120 papers each
13
- 4. Council-of-3-LLMs → 3 Mistral-small expert personas → mode vote
14
- 5. PAJAIS mapping + audit CSV + narrative
15
- """
16
-
17
- from __future__ import annotations
18
-
19
- import json
20
- import io
21
- from pathlib import Path
22
-
23
- import numpy as np
24
- import pandas as pd
25
- import plotly.express as px
26
- from langchain_core.tools import tool
27
- from langchain_core.messages import HumanMessage
28
- from langchain_mistralai import ChatMistralAI
29
-
30
# Root folder for the files this module writes; created when the module is imported.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# The 25 PAJAIS research categories a cluster can be mapped to.
PAJAIS_CATEGORIES = [
    "Information Systems Theory",
    "IS Strategy & Governance",
    "Digital Innovation",
    "Enterprise Systems",
    "AI & Intelligent Systems",
    "Big Data & Analytics",
    "Cybersecurity & Privacy",
    "Cloud Computing",
    "IS in Healthcare",
    "IS in Education",
    "E-Commerce & Digital Markets",
    "Social Media & Platforms",
    "Human-Computer Interaction",
    "IS Project Management",
    "IT Outsourcing",
    "Knowledge Management",
    "IS Development Methodologies",
    "Digital Transformation",
    "IS Ethics & Society",
    "IS in Developing Countries",
    "Mobile Computing",
    "IT Infrastructure",
    "IS Adoption & Diffusion",
    "IS Evaluation",
    "Organizational IS & Change",
]

# Lazily loaded SPECTER2 tokenizer/model pair; both stay None until first use.
_SPECTER_TOKENIZER = None
_SPECTER_MODEL_OBJ = None
52
-
53
-
54
def _get_specter():
    """Return the cached ``(tokenizer, model)`` pair, loading it on first use.

    If either module-level cache slot is still ``None`` the pair is obtained
    from ``_load_specter_fresh()``; otherwise the cached objects are returned.
    """
    cached = (_SPECTER_TOKENIZER, _SPECTER_MODEL_OBJ)
    if any(part is None for part in cached):
        return _load_specter_fresh()
    return cached
61
-
62
-
63
def _load_specter_fresh():
    """Load the SPECTER2 tokenizer and model and store them in the module cache.

    Returns:
        The ``(tokenizer, model)`` pair, with the model switched to eval mode.
    """
    global _SPECTER_TOKENIZER, _SPECTER_MODEL_OBJ
    # Imported here so that importing this module does not require transformers.
    from transformers import AutoTokenizer, AutoModel

    model_id = "allenai/specter2_base"
    print("Loading SPECTER2 (allenai/specter2_base) — one-time HuggingFace download, then cached...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id)
    model.eval()

    # Publish to the cache only once both objects exist, so a failed model
    # load never leaves a half-filled cache behind.
    _SPECTER_TOKENIZER, _SPECTER_MODEL_OBJ = tokenizer, model
    print("SPECTER2 loaded OK.")
    return tokenizer, model
73
-
74
-
75
def _embed_specter(texts: list) -> np.ndarray:
    """Embed texts with SPECTER2 and return one L2-normalised row per text.

    Args:
        texts: Strings to embed. Each is truncated to 512 tokens.

    Returns:
        A 2-D array with ``len(texts)`` rows. For empty input an array with
        zero rows is returned instead of failing inside ``np.vstack``.
    """
    import torch

    tokenizer, model = _get_specter()

    if not texts:
        # Width taken from the model config; 768 is the documented SPECTER2
        # size and only used if the config does not expose hidden_size.
        width = int(getattr(model.config, "hidden_size", 768))
        return np.empty((0, width), dtype=np.float32)

    batch_size = 8
    chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True,
                           max_length=512, return_tensors="pt")
        with torch.no_grad():  # inference only, no gradients needed
            out = model(**inputs)
        emb = out.last_hidden_state[:, 0, :].numpy()  # first-token (CLS) vector
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        # Floor the norm so an all-zero vector cannot divide by zero.
        chunks.append(emb / np.maximum(norms, 1e-9))
    return np.vstack(chunks)
91
-
92
-
93
def _p2() -> dict:
    """Return every V2 output path, creating ``data/v2/`` if it is missing.

    All entries point inside ``DATA_DIR / "v2"`` except ``"comparison"``,
    which sits directly in ``DATA_DIR``. ``"dir"`` is the V2 folder itself.
    """
    base = DATA_DIR / "v2"
    base.mkdir(parents=True, exist_ok=True)

    file_names = (
        ("papers", "papers.json"),
        ("embeddings", "embeddings.npy"),
        ("umap_emb", "umap_emb.npy"),
        ("clusters", "clusters.json"),
        ("summaries", "summaries.json"),
        ("taxonomy", "taxonomy.json"),
        ("charts", "charts.json"),
        ("audit_csv", "cluster_audit.csv"),
        ("narrative", "narrative_v2.txt"),
    )

    paths = {"dir": base}
    for key, file_name in file_names:
        paths[key] = base / file_name
    paths["comparison"] = DATA_DIR / "comparison_v2.csv"
    return paths
110
-
111
-
112
- def _read_csv_robust(path) -> pd.DataFrame:
113
- raw = Path(path).read_bytes()
114
- for enc in ["utf-8", "utf-8-sig", "latin-1", "cp1252"]:
115
- decoded = raw.decode(enc, errors="replace")
116
- return pd.read_csv(io.StringIO(decoded))
117
- return pd.read_csv(path)
118
-
119
-
120
- def _call_llm_json(llm, prompt: str):
121
- response = llm.invoke([HumanMessage(content=prompt)])
122
- raw = response.content.strip()
123
- raw = raw.split("```json")[-1].split("```")[0].strip() if "```" in raw else raw
124
- return json.loads(raw)
125
-
126
-
127
- def _mode_label(labels: list) -> str:
128
- from collections import Counter
129
- return Counter(labels).most_common(1)[0][0]
130
-
131
-
132
- # =============================================================================
133
- # V2 TOOL 1 — load_and_embed_specter2
134
- # =============================================================================
135
@tool
def load_and_embed_specter2(csv_path: str = "data/uploaded.csv") -> str:
    """Load Scopus CSV, build one combined Title+Abstract text per paper, embed with SPECTER2.
    SPECTER2 (allenai/specter2_base) is a LOCAL HuggingFace model — NO API key needed.
    First call downloads ~440 MB and caches; subsequent calls are instant.
    Output saved to data/v2/ only — completely independent of Classic (v1) run.
    Args:
        csv_path: Path to uploaded Scopus CSV.
    """
    # Returns a JSON string with total_papers, valid_papers, embedding_dim, note.
    # Raises ValueError when no row has more than 5 words of title+abstract.
    p = _p2()
    df = _read_csv_robust(csv_path)

    # Lower-cased, stripped header -> original header, for exact-name lookups.
    col_map = {c.strip().lower(): c for c in df.columns}

    def pick_column(keyword, exact_first=True):
        """Find a column: exact header match first, else first header containing keyword."""
        if exact_first and keyword in col_map:
            return col_map[keyword]
        return next((c for c in df.columns if keyword in c.lower()), None)

    title_col = pick_column("title")
    abstract_col = pick_column("abstract")
    doi_col = pick_column("doi")
    year_col = pick_column("year")
    journal_col = pick_column("source", exact_first=False)

    n = len(df)

    def column_values(col):
        """Column as a list with missing cells blanked; all blanks if the column is absent."""
        return list(df[col].fillna("")) if col else [""] * n

    titles = column_values(title_col)
    abstracts = column_values(abstract_col)
    dois = column_values(doi_col)
    years = column_values(year_col)
    journals = column_values(journal_col)

    combined = [
        "{} {}".format(str(title).strip(), str(abstract).strip()).strip()
        for title, abstract in zip(titles, abstracts)
    ]
    # Keep only rows with more than 5 words of combined text.
    valid_idx = [i for i, text in enumerate(combined) if len(text.split()) > 5]
    if not valid_idx:
        raise ValueError(
            "No papers with more than 5 words of title+abstract text found in {}".format(csv_path)
        )

    papers = [{
        "paper_idx": i,
        "title": titles[i],
        "abstract": abstracts[i],
        "doi": dois[i],
        "year": str(years[i]),
        "journal": str(journals[i]),
        "combined": combined[i],
    } for i in valid_idx]

    valid_texts = [combined[i] for i in valid_idx]
    print("Embedding {} papers with SPECTER2...".format(len(valid_texts)))
    embs = _embed_specter(valid_texts)

    # Persist both files only after embedding succeeded, so papers.json and
    # embeddings.npy always describe the same set of papers.
    p["papers"].write_text(json.dumps(papers, indent=2, ensure_ascii=False))
    np.save(p["embeddings"], embs)

    return json.dumps({
        "total_papers": n,
        "valid_papers": len(papers),
        "embedding_dim": int(embs.shape[1]),
        "note": "SPECTER2 embeddings saved to data/v2/. No API key needed.",
    })
188
-
189
-
190
- # =============================================================================
191
- # V2 TOOL 2 — cluster_with_umap_hdbscan
192
- # =============================================================================
193
@tool
def cluster_with_umap_hdbscan(
    umap_neighbors: int = 15,
    umap_min_dist: float = 0.05,
    hdbscan_min_cluster_size: int = 5,
    hdbscan_min_samples: int = 3,
) -> str:
    """Reduce SPECTER2 embeddings with UMAP (cosine) then cluster with HDBSCAN.
    Targets 15-30 clusters, each with 5-120 papers. Saves results + charts to data/v2/.
    Args:
        umap_neighbors: UMAP n_neighbors (default 15).
        umap_min_dist: UMAP min_dist (default 0.05 for tight clusters).
        hdbscan_min_cluster_size: Min papers per cluster (default 5).
        hdbscan_min_samples: HDBSCAN core-point threshold (default 3).
    """
    # Returns a JSON string summarising the kept clusters and the noise count.
    # Raises ValueError when papers.json and embeddings.npy disagree in length.
    import umap as umap_mod
    import hdbscan as hdbscan_mod

    p = _p2()
    embs = np.load(p["embeddings"])
    papers = json.loads(p["papers"].read_text())

    # Row i of the embeddings must belong to papers[i]; fail early and clearly
    # rather than after the expensive UMAP fit.
    if len(papers) != len(embs):
        raise ValueError(
            "papers.json has {} entries but embeddings.npy has {} rows; "
            "re-run load_and_embed_specter2 first.".format(len(papers), len(embs))
        )

    print("UMAP fit (n_neighbors={}, min_dist={})...".format(umap_neighbors, umap_min_dist))
    reducer = umap_mod.UMAP(
        n_components=5, n_neighbors=umap_neighbors,
        min_dist=umap_min_dist, metric="cosine",
        random_state=42, verbose=False,
    )
    umap_embs = reducer.fit_transform(embs)
    np.save(p["umap_emb"], umap_embs)

    print("HDBSCAN fit (min_cluster_size={})...".format(hdbscan_min_cluster_size))
    clusterer = hdbscan_mod.HDBSCAN(
        min_cluster_size=hdbscan_min_cluster_size,
        min_samples=hdbscan_min_samples,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )
    labels = clusterer.fit_predict(umap_embs)
    probs = clusterer.probabilities_
    # Label -1 is excluded from the cluster list and counted as noise.
    unique = sorted(set(labels.tolist()) - {-1})
    noise = int((labels == -1).sum())
    print("Raw clusters: {}, noise: {}".format(len(unique), noise))

    def build_cluster(enum_pair):
        """Build the summary record for one raw HDBSCAN label."""
        seq_id, raw_cid = enum_pair
        mask = labels == raw_cid
        indices = [i for i, m in enumerate(mask.tolist()) if m]
        cpaps = [papers[i] for i in indices]
        cembs = embs[mask]
        cprobs = probs[mask].tolist()
        centroid = cembs.mean(axis=0)
        c_norm = centroid / max(float(np.linalg.norm(centroid)), 1e-9)
        norms = np.linalg.norm(cembs, axis=1, keepdims=True)
        # Cosine similarity of every member to the cluster centroid.
        sims = (cembs / np.maximum(norms, 1e-9) @ c_norm).tolist()
        # Positions (within this cluster) of the 3 members closest to the centroid.
        top3 = sorted(range(len(sims)), key=lambda x: -sims[x])[:3]
        return {
            "cluster_id": seq_id + 1,
            "paper_count": int(mask.sum()),
            "papers": cpaps,
            "hdbscan_probs": cprobs,
            "centroid_sims": sims,
            "centroid": centroid.tolist(),
            "top3_paper_idx": top3,
            "top3_titles": [cpaps[i]["title"] for i in top3],
            "top3_abstracts": [cpaps[i]["abstract"][:200] for i in top3],
        }

    all_clusters = [build_cluster(pair) for pair in enumerate(unique)]
    # Keep clusters of 5-120 papers, largest first, then renumber from 1.
    valid = sorted([c for c in all_clusters if 5 <= c["paper_count"] <= 120],
                   key=lambda c: -c["paper_count"])
    valid = [{**c, "cluster_id": i + 1} for i, c in enumerate(valid)]

    # Separate 2-D UMAP fit, used only for the scatter chart.
    r2d = umap_mod.UMAP(n_components=2, n_neighbors=umap_neighbors,
                        min_dist=umap_min_dist, metric="cosine",
                        random_state=42, verbose=False)
    umap_2d = r2d.fit_transform(embs)
    cdf = pd.DataFrame({
        "x": umap_2d[:, 0].tolist(), "y": umap_2d[:, 1].tolist(),
        # The scatter is coloured by the raw HDBSCAN label, not the renumbered id.
        "cluster": [str(lb) for lb in labels.tolist()],
        "title": [pp["title"][:50] for pp in papers],
        "prob": probs.tolist(),
    })
    fig_s = px.scatter(cdf, x="x", y="y", color="cluster",
                       hover_data=["title", "prob"],
                       title="UMAP+HDBSCAN — {} clusters, {} noise".format(len(valid), noise))
    fig_b = px.bar(
        x=["C{}".format(c["cluster_id"]) for c in valid],
        y=[c["paper_count"] for c in valid],
        title="Papers per Cluster",
    )
    charts = {
        "scatter": fig_s.to_html(full_html=False, include_plotlyjs="cdn"),
        "bar": fig_b.to_html(full_html=False, include_plotlyjs=False),
    }
    p["charts"].write_text(json.dumps(charts))
    p["clusters"].write_text(json.dumps(valid, indent=2, ensure_ascii=False))

    return json.dumps({
        "clusters_found": len(valid),
        "noise_papers": noise,
        "total_papers": len(papers),
        "cluster_sizes": [c["paper_count"] for c in valid],
        "within_15_30": 15 <= len(valid) <= 30,
        "note": "{} clusters (5-120 papers each). Ready for council-of-3 labeling.".format(len(valid)),
    })
301
-
302
-
303
  # =============================================================================
304
- # V2 TOOL 3 — label_clusters_council_of_3
305
  # =============================================================================
306
  @tool
307
  def label_clusters_council_of_3(batch_size: int = 5) -> str:
308
- """Label each cluster using a council of 3 Mistral-small LLM calls with distinct expert personas.
 
 
 
 
309
  Final label = mode (most common) of the 3 responses.
310
  Vote agreement = unanimous / majority / split.
311
  Saves enriched summaries + full audit CSV (one row per paper) to data/v2/.
 
 
 
 
 
312
  Args:
313
  batch_size: Clusters per LLM call (default 5).
314
  """
315
  import time
 
 
 
 
 
 
 
 
316
  p = _p2()
317
  clusters = json.loads(p["clusters"].read_text())
318
 
319
- PERSONAS = [
 
320
  {
321
- "name": "IS_THEORY",
322
- "instruction": (
323
- "You are an Information Systems theory expert with 20 years systematic "
324
- "literature review experience. Label clusters with precise academic IS "
325
- "terminology. Labels: 4-7 words, noun-phrase, IS-specific "
326
- "(e.g. 'Enterprise Resource Planning Adoption Barriers', "
327
- "'IS Governance Frameworks Healthcare')."
328
  ),
329
  },
330
  {
331
- "name": "DIGITAL_MGT",
332
- "instruction": (
333
- "You are a digital management and organisational behaviour scholar "
334
- "specialising in technology adoption and digital transformation. "
335
- "Labels: 4-7 words, strategic/managerial framing "
336
- "(e.g. 'Organisational Change Through Digital Platforms', "
337
- "'Strategic IT-Business Alignment Mechanisms')."
338
  ),
339
  },
340
  {
341
- "name": "COMP_SCI",
342
- "instruction": (
343
- "You are a computer science and AI researcher reviewing IS literature. "
344
- "Labels: 4-7 words, technically precise "
345
- "(e.g. 'Machine Learning Clinical Decision Support', "
346
- "'Cloud Infrastructure Scalability Patterns')."
347
  ),
348
  },
349
  ]
350
-
351
- llm = ChatMistralAI(model="mistral-small-latest", temperature=0.2)
352
-
353
- def make_prompt(instruction, batch):
354
- mini = [{"cluster_id": c["cluster_id"], "paper_count": c["paper_count"],
355
- "top3_titles": c["top3_titles"], "top3_abstracts": c["top3_abstracts"]}
356
- for c in batch]
 
 
 
 
 
 
357
  return (
358
- instruction + "\n\n"
359
- "Label each cluster. IDs in this batch: " + str([c["cluster_id"] for c in batch]) + "\n\n"
 
 
360
  "CLUSTERS:\n" + json.dumps(mini, indent=2) + "\n\n"
361
  "Return ONLY a raw JSON array β€” no markdown, no preamble.\n"
362
  "Each element: cluster_id (int), label (4-7 words), "
363
  "confidence (High/Medium/Low), reasoning (one sentence)."
364
  )
 
365
 
 
 
 
366
  persona_results = [{}, {}, {}]
367
- batch_starts = list(range(0, len(clusters), batch_size))
368
 
369
- for pi, persona in enumerate(PERSONAS):
 
 
370
  all_labels = []
 
 
 
371
  for bi, start in enumerate(batch_starts):
372
  batch = clusters[start: start + batch_size]
373
- result = _call_llm_json(llm, make_prompt(persona["instruction"], batch))
374
- all_labels.extend(result)
375
- _ = time.sleep(10) if bi < len(batch_starts) - 1 else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  for item in all_labels:
377
  cid = int(item.get("cluster_id", 0))
378
  persona_results[pi][cid] = item
379
- _ = time.sleep(15) if pi < len(PERSONAS) - 1 else None
380
 
 
 
 
 
 
 
381
  def enrich(cluster):
382
  cid = cluster["cluster_id"]
383
- raw_votes = [str(persona_results[pi].get(cid, {}).get("label", "")).strip()
384
- for pi in range(3)]
385
- votes = [v if v and v.lower() not in ("", "none", "null")
386
- else "Cluster {}".format(cid) for v in raw_votes]
387
- final = _mode_label(votes)
388
- agreement = ("unanimous" if len(set(votes)) == 1
389
- else "majority" if votes.count(final) >= 2
390
- else "split")
 
 
 
 
 
 
 
391
  return {
392
  **cluster,
393
  "label": final,
394
- "llm_vote_1_IS_THEORY": votes[0],
395
- "llm_vote_2_DIGITAL_MGT": votes[1],
396
- "llm_vote_3_COMP_SCI": votes[2],
397
  "confidence_1": persona_results[0].get(cid, {}).get("confidence", ""),
398
  "confidence_2": persona_results[1].get(cid, {}).get("confidence", ""),
399
  "confidence_3": persona_results[2].get(cid, {}).get("confidence", ""),
@@ -412,29 +170,29 @@ def label_clusters_council_of_3(batch_size: int = 5) -> str:
412
  cid = c["cluster_id"]
413
  for li, paper in enumerate(c["papers"]):
414
  rows.append({
415
- "cluster_id": cid,
416
- "final_label": c["label"],
417
- "vote_agreement": c["vote_agreement"],
418
- "llm1_IS_THEORY_label": c["llm_vote_1_IS_THEORY"],
419
- "llm2_DIGITAL_MGT_label": c["llm_vote_2_DIGITAL_MGT"],
420
- "llm3_COMP_SCI_label": c["llm_vote_3_COMP_SCI"],
421
- "llm1_confidence": c["confidence_1"],
422
- "llm2_confidence": c["confidence_2"],
423
- "llm3_confidence": c["confidence_3"],
424
- "llm1_reasoning": c["reasoning_1"],
425
- "llm2_reasoning": c["reasoning_2"],
426
- "llm3_reasoning": c["reasoning_3"],
427
- "paper_doi": paper.get("doi", ""),
428
- "paper_title": paper.get("title", ""),
429
- "paper_year": paper.get("year", ""),
430
- "paper_journal": paper.get("journal", ""),
431
- "abstract_preview": paper.get("abstract", "")[:300],
432
- "combined_preview": paper.get("combined", "")[:200],
433
- "centroid_cosine_sim": round(float(
434
  c["centroid_sims"][li] if li < len(c["centroid_sims"]) else 0.0), 4),
435
- "hdbscan_probability": round(float(
436
  c["hdbscan_probs"][li] if li < len(c["hdbscan_probs"]) else 0.0), 4),
437
- "is_top3_centroid": "YES" if li in c["top3_paper_idx"] else "no",
438
  })
439
 
440
  pd.DataFrame(rows).to_csv(p["audit_csv"], index=False, encoding="utf-8-sig")
@@ -447,96 +205,6 @@ def label_clusters_council_of_3(batch_size: int = 5) -> str:
447
  "majority": majority,
448
  "split": len(enriched) - unanimous - majority,
449
  "audit_csv_rows": len(rows),
450
- "note": "Audit CSV ready ({} rows, one per paper). Download from Download tab.".format(len(rows)),
451
- })
452
-
453
-
454
- # =============================================================================
455
- # V2 TOOL 4 — map_clusters_to_pajais_v2
456
- # =============================================================================
457
@tool
def map_clusters_to_pajais_v2() -> str:
    """Map v2 cluster labels to PAJAIS 25 IS research categories via Mistral LLM.
    Saves taxonomy to data/v2/taxonomy.json. Independent of v1 taxonomy.
    """
    # Returns a JSON string with the number of mapped clusters and a note.
    import time

    paths = _p2()
    summaries = json.loads(paths["summaries"].read_text())
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.1)

    # Compact view of each cluster: id, label and its first two sample titles.
    compact = []
    for summary in summaries:
        compact.append({
            "cluster_id": summary["cluster_id"],
            "name": summary["label"],
            "sample": summary["top3_titles"][:2],
        })

    per_call = 10
    prompt_head = (
        "Map each IS research cluster to the single most relevant PAJAIS category.\n\n"
        "CLUSTERS:\n"
    )
    prompt_tail = (
        "\n\n"
        "PAJAIS CATEGORIES:\n" + json.dumps(PAJAIS_CATEGORIES, indent=2) + "\n\n"
        "Return ONLY a raw JSON array. Each element: "
        "cluster_id (int), name (str), pajais_category (str), "
        "confidence (High/Medium/Low), rationale (one sentence). "
        "No markdown."
    )

    mapped = []
    for offset in range(0, len(compact), per_call):
        if offset > 0:
            # Pause between consecutive LLM calls (never after the last one).
            time.sleep(10)
        chunk = compact[offset:offset + per_call]
        prompt = prompt_head + json.dumps(chunk, indent=2) + prompt_tail
        mapped.extend(_call_llm_json(llm, prompt))

    paths["taxonomy"].write_text(json.dumps(mapped, indent=2, ensure_ascii=False))
    summary_out = {
        "mapped_clusters": len(mapped),
        "note": "PAJAIS taxonomy saved to data/v2/taxonomy.json",
    }
    return json.dumps(summary_out)
490
-
491
-
492
- # =============================================================================
493
- # V2 TOOL 5 — export_v2_outputs
494
- # =============================================================================
495
@tool
def export_v2_outputs() -> str:
    """Generate final comparison_v2.csv and narrative_v2.txt for the SPECTER2 run.
    comparison_v2.csv: enriched audit CSV with PAJAIS column added.
    narrative_v2.txt: 500-word Section 7 academic discussion.
    Both saved to data/v2/ and data/comparison_v2.csv.
    """
    # Returns a JSON string with row/word counts and the two output paths.
    paths = _p2()
    summaries = json.loads(paths["summaries"].read_text())
    taxonomy = json.loads(paths["taxonomy"].read_text())

    # cluster_id (as text) -> PAJAIS category; a later duplicate id overwrites an earlier one.
    category_by_id = {}
    for entry in taxonomy:
        category_by_id[str(entry.get("cluster_id", ""))] = entry.get("pajais_category", "Unknown")

    audit_df = pd.read_csv(paths["audit_csv"], encoding="utf-8-sig")
    categories = []
    for raw_id in audit_df["cluster_id"]:
        # Normalise ids such as 3, "3" or 3.0 to the text "3" before the lookup.
        lookup_key = str(int(float(str(raw_id))))
        categories.append(category_by_id.get(lookup_key, "Unknown"))
    audit_df["pajais_category"] = categories

    out_path = paths["comparison"]
    audit_df.to_csv(out_path, index=False, encoding="utf-8-sig")

    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.4)
    cluster_summary = []
    for summary in summaries:
        cluster_summary.append({
            "cluster": summary["cluster_id"],
            "label": summary["label"],
            "papers": summary["paper_count"],
            "agreement": summary["vote_agreement"],
        })

    instructions = (
        "Write Section 7 (Discussion and Thematic Synthesis) for a systematic "
        "IS literature review. ~500 words, formal academic prose.\n"
        "Method: SPECTER2 document embeddings + UMAP + HDBSCAN + council-of-3-LLMs labeling.\n"
        "Cover: (a) overview of clusters/themes, (b) dominant PAJAIS categories, "
        "(c) inter-cluster relationships, (d) implications for IS research, "
        "(e) methodological contribution vs traditional BERTopic, (f) limitations.\n\n"
    )
    prompt = (
        instructions
        + "CLUSTERS:\n" + json.dumps(cluster_summary, indent=2) + "\n\n"
        + "PAJAIS MAPPING:\n" + json.dumps(taxonomy, indent=2) + "\n\n"
        + "Continuous academic paragraphs only. No bullet points or headers."
    )
    narrative = llm.invoke([HumanMessage(content=prompt)]).content
    paths["narrative"].write_text(narrative, encoding="utf-8")

    report = {
        "comparison_csv_rows": len(audit_df),
        "comparison_csv_path": str(out_path),
        "narrative_words": len(narrative.split()),
        "narrative_path": str(paths["narrative"]),
        "note": "comparison_v2.csv + narrative_v2.txt ready in Download tab.",
    }
    return json.dumps(report)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # =============================================================================
2
+ # V2 TOOL 3 — label_clusters_council_of_3 (TRUE multi-LLM ensemble)
3
  # =============================================================================
4
  @tool
5
  def label_clusters_council_of_3(batch_size: int = 5) -> str:
6
+ """Label each cluster using a TRUE council of 3 DIFFERENT LLMs:
7
+ 1. Mistral (mistral-small-latest)
8
+ 2. OpenAI (gpt-4o-mini)
9
+ 3. Groq (llama3-70b-8192)
10
+ Each model receives the SAME prompt independently.
11
  Final label = mode (most common) of the 3 responses.
12
  Vote agreement = unanimous / majority / split.
13
  Saves enriched summaries + full audit CSV (one row per paper) to data/v2/.
14
+
15
+ API keys are read automatically from environment variables:
16
+ MISTRAL_API_KEY, OPENAI_API_KEY, GROQ_API_KEY
17
+ Set these in HuggingFace Space → Settings → Variables and Secrets.
18
+
19
  Args:
20
  batch_size: Clusters per LLM call (default 5).
21
  """
22
  import time
23
+ import os
24
+
25
+ # ── NEW: import all 3 LangChain integrations ──────────────────────────────
26
+ from langchain_mistralai import ChatMistralAI
27
+ from langchain_openai import ChatOpenAI
28
+ from langchain_groq import ChatGroq
29
+ # ─────────────────────────────────────────────────────────────────────────
30
+
31
  p = _p2()
32
  clusters = json.loads(p["clusters"].read_text())
33
 
34
+ # ── NEW: define 3 real LLMs (keys picked up from env automatically) ───────
35
+ COUNCIL = [
36
  {
37
+ "name": "MISTRAL",
38
+ "model": ChatMistralAI(
39
+ model="mistral-small-latest",
40
+ temperature=0.2,
41
+ # api_key read from MISTRAL_API_KEY env var automatically
 
 
42
  ),
43
  },
44
  {
45
+ "name": "OPENAI",
46
+ "model": ChatOpenAI(
47
+ model="gpt-4o-mini",
48
+ temperature=0.2,
49
+ # api_key read from OPENAI_API_KEY env var automatically
 
 
50
  ),
51
  },
52
  {
53
+ "name": "GROQ",
54
+ "model": ChatGroq(
55
+ model="llama3-70b-8192",
56
+ temperature=0.2,
57
+ # api_key read from GROQ_API_KEY env var automatically
 
58
  ),
59
  },
60
  ]
61
+ # ─────────────────────────────────────────────────────────────────────────
62
+
63
+ # ── UNCHANGED: single shared prompt builder (same prompt for all 3 LLMs) ──
64
+ def make_prompt(batch):
65
+ mini = [
66
+ {
67
+ "cluster_id": c["cluster_id"],
68
+ "paper_count": c["paper_count"],
69
+ "top3_titles": c["top3_titles"],
70
+ "top3_abstracts": c["top3_abstracts"],
71
+ }
72
+ for c in batch
73
+ ]
74
  return (
75
+ "You are an Information Systems research expert conducting a systematic "
76
+ "literature review. Label each cluster with a precise 4-7 word noun-phrase "
77
+ "that reflects its core IS research theme.\n\n"
78
+ "Cluster IDs in this batch: " + str([c["cluster_id"] for c in batch]) + "\n\n"
79
  "CLUSTERS:\n" + json.dumps(mini, indent=2) + "\n\n"
80
  "Return ONLY a raw JSON array β€” no markdown, no preamble.\n"
81
  "Each element: cluster_id (int), label (4-7 words), "
82
  "confidence (High/Medium/Low), reasoning (one sentence)."
83
  )
84
+ # ─────────────────────────────────────────────────────────────────────────
85
 
86
+ # ── NEW: run each LLM independently across all batches ───────────────────
87
+ # persona_results[i] = { cluster_id: {label, confidence, reasoning} }
88
+ # shape is identical to before so all downstream code is UNCHANGED
89
  persona_results = [{}, {}, {}]
90
+ batch_starts = list(range(0, len(clusters), batch_size))
91
 
92
+ for pi, member in enumerate(COUNCIL):
93
+ llm = member["model"]
94
+ llm_name = member["name"]
95
  all_labels = []
96
+
97
+ print(f"Council member {pi+1}/3 ({llm_name}) labeling {len(clusters)} clusters...")
98
+
99
  for bi, start in enumerate(batch_starts):
100
  batch = clusters[start: start + batch_size]
101
+ prompt = make_prompt(batch) # same prompt for every LLM
102
+
103
+ # ── NEW: per-model error handling so one failure doesn't kill all ─
104
+ try:
105
+ result = _call_llm_json(llm, prompt)
106
+ all_labels.extend(result)
107
+ except Exception as e:
108
+ print(f" WARNING: {llm_name} batch {bi} failed: {e}. Using fallback labels.")
109
+ for c in batch:
110
+ all_labels.append({
111
+ "cluster_id": c["cluster_id"],
112
+ "label": f"Cluster {c['cluster_id']} ({llm_name} error)",
113
+ "confidence": "Low",
114
+ "reasoning": f"Fallback β€” {llm_name} error: {str(e)[:80]}",
115
+ })
116
+ # ─────────────────────────────────────────────────────────────────────
117
+
118
+ # small delay between batches to respect rate limits
119
+ if bi < len(batch_starts) - 1:
120
+ time.sleep(8)
121
+
122
  for item in all_labels:
123
  cid = int(item.get("cluster_id", 0))
124
  persona_results[pi][cid] = item
 
125
 
126
+ # delay between council members (Groq is fast, Mistral/OpenAI need breathing room)
127
+ if pi < len(COUNCIL) - 1:
128
+ time.sleep(10)
129
+ # ─────────────────────────────────────────────────────────────────────────
130
+
131
+ # ── UNCHANGED from here down: voting + enrichment + CSV export ───────────
132
  def enrich(cluster):
133
  cid = cluster["cluster_id"]
134
+ raw_votes = [
135
+ str(persona_results[pi].get(cid, {}).get("label", "")).strip()
136
+ for pi in range(3)
137
+ ]
138
+ votes = [
139
+ v if v and v.lower() not in ("", "none", "null")
140
+ else "Cluster {}".format(cid)
141
+ for v in raw_votes
142
+ ]
143
+ final = _mode_label(votes)
144
+ agreement = (
145
+ "unanimous" if len(set(votes)) == 1
146
+ else "majority" if votes.count(final) >= 2
147
+ else "split"
148
+ )
149
  return {
150
  **cluster,
151
  "label": final,
152
+ "llm_vote_1_MISTRAL": votes[0], # key renamed to match real model
153
+ "llm_vote_2_OPENAI": votes[1], # key renamed to match real model
154
+ "llm_vote_3_GROQ": votes[2], # key renamed to match real model
155
  "confidence_1": persona_results[0].get(cid, {}).get("confidence", ""),
156
  "confidence_2": persona_results[1].get(cid, {}).get("confidence", ""),
157
  "confidence_3": persona_results[2].get(cid, {}).get("confidence", ""),
 
170
  cid = c["cluster_id"]
171
  for li, paper in enumerate(c["papers"]):
172
  rows.append({
173
+ "cluster_id": cid,
174
+ "final_label": c["label"],
175
+ "vote_agreement": c["vote_agreement"],
176
+ "llm1_MISTRAL_label": c["llm_vote_1_MISTRAL"], # renamed
177
+ "llm2_OPENAI_label": c["llm_vote_2_OPENAI"], # renamed
178
+ "llm3_GROQ_label": c["llm_vote_3_GROQ"], # renamed
179
+ "llm1_confidence": c["confidence_1"],
180
+ "llm2_confidence": c["confidence_2"],
181
+ "llm3_confidence": c["confidence_3"],
182
+ "llm1_reasoning": c["reasoning_1"],
183
+ "llm2_reasoning": c["reasoning_2"],
184
+ "llm3_reasoning": c["reasoning_3"],
185
+ "paper_doi": paper.get("doi", ""),
186
+ "paper_title": paper.get("title", ""),
187
+ "paper_year": paper.get("year", ""),
188
+ "paper_journal": paper.get("journal", ""),
189
+ "abstract_preview": paper.get("abstract", "")[:300],
190
+ "combined_preview": paper.get("combined", "")[:200],
191
+ "centroid_cosine_sim": round(float(
192
  c["centroid_sims"][li] if li < len(c["centroid_sims"]) else 0.0), 4),
193
+ "hdbscan_probability": round(float(
194
  c["hdbscan_probs"][li] if li < len(c["hdbscan_probs"]) else 0.0), 4),
195
+ "is_top3_centroid": "YES" if li in c["top3_paper_idx"] else "no",
196
  })
197
 
198
  pd.DataFrame(rows).to_csv(p["audit_csv"], index=False, encoding="utf-8-sig")
 
205
  "majority": majority,
206
  "split": len(enriched) - unanimous - majority,
207
  "audit_csv_rows": len(rows),
208
+ "council_members": [m["name"] for m in COUNCIL], # NEW: visible in output
209
+ "note": "True 3-LLM ensemble (Mistral+OpenAI+Groq). Audit CSV ready ({} rows).".format(len(rows)),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  })