Spaces:

aadisawant2912
/

topic_modelling

Sleeping

App Files Files Community

aadisawant2912 committed on about 1 month ago

Commit

f083a2e

verified ·

1 Parent(s): a19ce45

Update tools_v2.py

Browse files

Files changed (1) hide show

tools_v2.py +132 -464

tools_v2.py CHANGED Viewed

@@ -1,399 +1,157 @@
-"""
-tools_v2.py - SPECTER2 + HDBSCAN + UMAP thematic analysis tools.
-COMPLETELY INDEPENDENT from tools.py (v1). No shared state, no ordering dependency.
-V2 can be run before, after, or without ever running V1.
-SPECTER2 is allenai/specter2_base — a local HuggingFace model.
-NO API KEY required. Downloads once, cached automatically.
-Pipeline:
-  1. Combined Title+Abstract per paper → SPECTER2 embedding (768-dim)
-  2. UMAP (cosine, 5D) → tight document clusters
-  3. HDBSCAN → 15-30 clusters, 5-120 papers each
-  4. Council-of-3-LLMs → 3 Mistral-small expert personas → mode vote
-  5. PAJAIS mapping + audit CSV + narrative
-"""
-from __future__ import annotations
-import json
-import io
-from pathlib import Path
-import numpy as np
-import pandas as pd
-import plotly.express as px
-from langchain_core.tools import tool
-from langchain_core.messages import HumanMessage
-from langchain_mistralai import ChatMistralAI
-DATA_DIR = Path("data")
-DATA_DIR.mkdir(exist_ok=True)
-PAJAIS_CATEGORIES = [
-    "Information Systems Theory",    "IS Strategy & Governance",
-    "Digital Innovation",            "Enterprise Systems",
-    "AI & Intelligent Systems",      "Big Data & Analytics",
-    "Cybersecurity & Privacy",       "Cloud Computing",
-    "IS in Healthcare",              "IS in Education",
-    "E-Commerce & Digital Markets",  "Social Media & Platforms",
-    "Human-Computer Interaction",    "IS Project Management",
-    "IT Outsourcing",                "Knowledge Management",
-    "IS Development Methodologies",  "Digital Transformation",
-    "IS Ethics & Society",           "IS in Developing Countries",
-    "Mobile Computing",              "IT Infrastructure",
-    "IS Adoption & Diffusion",       "IS Evaluation",
-    "Organizational IS & Change",
-]
-# ── lazy-loaded models — initialised once on first call ───────────────────────
-_SPECTER_TOKENIZER = None
-_SPECTER_MODEL_OBJ = None
-def _get_specter():
-    global _SPECTER_TOKENIZER, _SPECTER_MODEL_OBJ
-    return (
-        (_SPECTER_TOKENIZER, _SPECTER_MODEL_OBJ)
-        if (_SPECTER_TOKENIZER is not None and _SPECTER_MODEL_OBJ is not None)
-        else _load_specter_fresh()
-    )
-def _load_specter_fresh():
-    global _SPECTER_TOKENIZER, _SPECTER_MODEL_OBJ
-    from transformers import AutoTokenizer, AutoModel
-    MODEL_ID = "allenai/specter2_base"
-    print("Loading SPECTER2 (allenai/specter2_base) — one-time HuggingFace download, then cached...")
-    _SPECTER_TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
-    _SPECTER_MODEL_OBJ = AutoModel.from_pretrained(MODEL_ID)
-    _SPECTER_MODEL_OBJ.eval()
-    print("SPECTER2 loaded OK.")
-    return _SPECTER_TOKENIZER, _SPECTER_MODEL_OBJ
-def _embed_specter(texts: list) -> np.ndarray:
-    import torch
-    tokenizer, model = _get_specter()
-    BATCH     = 8
-    all_embs  = []
-    starts    = list(range(0, len(texts), BATCH))
-    for start in starts:
-        batch  = texts[start: start + BATCH]
-        inputs = tokenizer(batch, padding=True, truncation=True,
-                           max_length=512, return_tensors="pt")
-        with torch.no_grad():
-            out = model(**inputs)
-        emb   = out.last_hidden_state[:, 0, :].numpy()          # CLS token
-        norms = np.linalg.norm(emb, axis=1, keepdims=True)
-        all_embs.append(emb / np.maximum(norms, 1e-9))           # L2-normalise
-    return np.vstack(all_embs)
-def _p2() -> dict:
-    """All paths for V2 — saved under data/v2/ only, never touches data/abstract/ or data/title/."""
-    d = DATA_DIR / "v2"
-    d.mkdir(parents=True, exist_ok=True)
-    return {
-        "dir":        d,
-        "papers":     d / "papers.json",
-        "embeddings": d / "embeddings.npy",
-        "umap_emb":   d / "umap_emb.npy",
-        "clusters":   d / "clusters.json",
-        "summaries":  d / "summaries.json",
-        "taxonomy":   d / "taxonomy.json",
-        "charts":     d / "charts.json",
-        "audit_csv":  d / "cluster_audit.csv",
-        "narrative":  d / "narrative_v2.txt",
-        "comparison": DATA_DIR / "comparison_v2.csv",
-    }
-def _read_csv_robust(path) -> pd.DataFrame:
-    raw = Path(path).read_bytes()
-    for enc in ["utf-8", "utf-8-sig", "latin-1", "cp1252"]:
-        decoded = raw.decode(enc, errors="replace")
-        return pd.read_csv(io.StringIO(decoded))
-    return pd.read_csv(path)
-def _call_llm_json(llm, prompt: str):
-    response = llm.invoke([HumanMessage(content=prompt)])
-    raw = response.content.strip()
-    raw = raw.split("```json")[-1].split("```")[0].strip() if "```" in raw else raw
-    return json.loads(raw)
-def _mode_label(labels: list) -> str:
-    from collections import Counter
-    return Counter(labels).most_common(1)[0][0]
-# =============================================================================
-# V2 TOOL 1 — load_and_embed_specter2
-# =============================================================================
-@tool
-def load_and_embed_specter2(csv_path: str = "data/uploaded.csv") -> str:
-    """Load Scopus CSV, build one combined Title+Abstract text per paper, embed with SPECTER2.
-    SPECTER2 (allenai/specter2_base) is a LOCAL HuggingFace model — NO API key needed.
-    First call downloads ~440 MB and caches; subsequent calls are instant.
-    Output saved to data/v2/ only — completely independent of Classic (v1) run.
-    Args:
-        csv_path: Path to uploaded Scopus CSV.
-    """
-    p  = _p2()
-    df = _read_csv_robust(csv_path)
-    col_map      = {c.strip().lower(): c for c in df.columns}
-    title_col    = col_map.get("title",    next((c for c in df.columns if "title"    in c.lower()), None))
-    abstract_col = col_map.get("abstract", next((c for c in df.columns if "abstract" in c.lower()), None))
-    doi_col      = col_map.get("doi",      next((c for c in df.columns if "doi"      in c.lower()), None))
-    year_col     = col_map.get("year",     next((c for c in df.columns if "year"     in c.lower()), None))
-    journal_col  = next((c for c in df.columns if "source" in c.lower()), None)
-    n = len(df)
-    titles    = list(df[title_col].fillna("")    if title_col    else [""] * n)
-    abstracts = list(df[abstract_col].fillna("") if abstract_col else [""] * n)
-    dois      = list(df[doi_col].fillna("")      if doi_col      else [""] * n)
-    years     = list(df[year_col].fillna("")     if year_col     else [""] * n)
-    journals  = list(df[journal_col].fillna("")  if journal_col  else [""] * n)
-    combined = ["{} {}".format(str(titles[i]).strip(), str(abstracts[i]).strip()).strip()
-                for i in range(n)]
-    valid_idx = [i for i, t in enumerate(combined) if len(t.split()) > 5]
-    papers = [{
-        "paper_idx": i,
-        "title":     titles[i],
-        "abstract":  abstracts[i],
-        "doi":       dois[i],
-        "year":      str(years[i]),
-        "journal":   str(journals[i]),
-        "combined":  combined[i],
-    } for i in valid_idx]
-    p["papers"].write_text(json.dumps(papers, indent=2, ensure_ascii=False))
-    valid_texts = [combined[i] for i in valid_idx]
-    print("Embedding {} papers with SPECTER2...".format(len(valid_texts)))
-    embs = _embed_specter(valid_texts)
-    np.save(p["embeddings"], embs)
-    return json.dumps({
-        "total_papers":  n,
-        "valid_papers":  len(papers),
-        "embedding_dim": int(embs.shape[1]),
-        "note": "SPECTER2 embeddings saved to data/v2/. No API key needed.",
-    })
-# =============================================================================
-# V2 TOOL 2 — cluster_with_umap_hdbscan
-# =============================================================================
-@tool
-def cluster_with_umap_hdbscan(
-    umap_neighbors: int = 15,
-    umap_min_dist: float = 0.05,
-    hdbscan_min_cluster_size: int = 5,
-    hdbscan_min_samples: int = 3,
-) -> str:
-    """Reduce SPECTER2 embeddings with UMAP (cosine) then cluster with HDBSCAN.
-    Targets 15-30 clusters, each with 5-120 papers. Saves results + charts to data/v2/.
-    Args:
-        umap_neighbors:           UMAP n_neighbors (default 15).
-        umap_min_dist:            UMAP min_dist (default 0.05 for tight clusters).
-        hdbscan_min_cluster_size: Min papers per cluster (default 5).
-        hdbscan_min_samples:      HDBSCAN core-point threshold (default 3).
-    """
-    import umap as umap_mod
-    import hdbscan as hdbscan_mod
-    p      = _p2()
-    embs   = np.load(p["embeddings"])
-    papers = json.loads(p["papers"].read_text())
-    print("UMAP fit (n_neighbors={}, min_dist={})...".format(umap_neighbors, umap_min_dist))
-    reducer = umap_mod.UMAP(
-        n_components=5, n_neighbors=umap_neighbors,
-        min_dist=umap_min_dist, metric="cosine",
-        random_state=42, verbose=False,
-    )
-    umap_embs = reducer.fit_transform(embs)
-    np.save(p["umap_emb"], umap_embs)
-    print("HDBSCAN fit (min_cluster_size={})...".format(hdbscan_min_cluster_size))
-    clusterer = hdbscan_mod.HDBSCAN(
-        min_cluster_size=hdbscan_min_cluster_size,
-        min_samples=hdbscan_min_samples,
-        metric="euclidean",
-        cluster_selection_method="eom",
-        prediction_data=True,
-    )
-    labels = clusterer.fit_predict(umap_embs)
-    probs  = clusterer.probabilities_
-    unique = sorted(set(labels.tolist()) - {-1})
-    print("Raw clusters: {}, noise: {}".format(len(unique), int((labels == -1).sum())))
-    def build_cluster(enum_pair):
-        seq_id, raw_cid = enum_pair
-        mask    = labels == raw_cid
-        indices = [i for i, m in enumerate(mask.tolist()) if m]
-        cpaps   = [papers[i] for i in indices]
-        cembs   = embs[mask]
-        cprobs  = probs[mask].tolist()
-        centroid = cembs.mean(axis=0)
-        c_norm   = centroid / max(float(np.linalg.norm(centroid)), 1e-9)
-        norms    = np.linalg.norm(cembs, axis=1, keepdims=True)
-        sims     = (cembs / np.maximum(norms, 1e-9) @ c_norm).tolist()
-        top3     = sorted(range(len(sims)), key=lambda x: -sims[x])[:3]
-        return {
-            "cluster_id":     seq_id + 1,
-            "paper_count":    int(mask.sum()),
-            "papers":         cpaps,
-            "hdbscan_probs":  cprobs,
-            "centroid_sims":  sims,
-            "centroid":       centroid.tolist(),
-            "top3_paper_idx": top3,
-            "top3_titles":    [cpaps[i]["title"]           for i in top3],
-            "top3_abstracts": [cpaps[i]["abstract"][:200]  for i in top3],
-        }
-    all_clusters = list(map(build_cluster, enumerate(unique)))
-    valid = sorted([c for c in all_clusters if 5 <= c["paper_count"] <= 120],
-                   key=lambda c: -c["paper_count"])
-    valid = [{**c, "cluster_id": i + 1} for i, c in enumerate(valid)]
-    noise = int((labels == -1).sum())
-    # 2-D UMAP for scatter chart
-    r2d = umap_mod.UMAP(n_components=2, n_neighbors=umap_neighbors,
-                        min_dist=umap_min_dist, metric="cosine",
-                        random_state=42, verbose=False)
-    umap_2d = r2d.fit_transform(embs)
-    cdf = pd.DataFrame({
-        "x":       umap_2d[:, 0].tolist(), "y": umap_2d[:, 1].tolist(),
-        "cluster": [str(lb) for lb in labels.tolist()],
-        "title":   [pp["title"][:50] for pp in papers],
-        "prob":    probs.tolist(),
-    })
-    fig_s = px.scatter(cdf, x="x", y="y", color="cluster",
-                       hover_data=["title", "prob"],
-                       title="UMAP+HDBSCAN — {} clusters, {} noise".format(len(valid), noise))
-    fig_b = px.bar(
-        x=["C{}".format(c["cluster_id"]) for c in valid],
-        y=[c["paper_count"]              for c in valid],
-        title="Papers per Cluster",
-    )
-    charts = {
-        "scatter": fig_s.to_html(full_html=False, include_plotlyjs="cdn"),
-        "bar":     fig_b.to_html(full_html=False, include_plotlyjs=False),
-    }
-    p["charts"].write_text(json.dumps(charts))
-    p["clusters"].write_text(json.dumps(valid, indent=2, ensure_ascii=False))
-    return json.dumps({
-        "clusters_found": len(valid),
-        "noise_papers":   noise,
-        "total_papers":   len(papers),
-        "cluster_sizes":  [c["paper_count"] for c in valid],
-        "within_15_30":   15 <= len(valid) <= 30,
-        "note": "{} clusters (5-120 papers each). Ready for council-of-3 labeling.".format(len(valid)),
-    })
 # =============================================================================
-# V2 TOOL 3 — label_clusters_council_of_3
 # =============================================================================
 @tool
 def label_clusters_council_of_3(batch_size: int = 5) -> str:
-    """Label each cluster using a council of 3 Mistral-small LLM calls with distinct expert personas.
     Final label = mode (most common) of the 3 responses.
     Vote agreement = unanimous / majority / split.
     Saves enriched summaries + full audit CSV (one row per paper) to data/v2/.
     Args:
         batch_size: Clusters per LLM call (default 5).
     """
     import time
     p        = _p2()
     clusters = json.loads(p["clusters"].read_text())
-    PERSONAS = [
         {
-            "name": "IS_THEORY",
-            "instruction": (
-                "You are an Information Systems theory expert with 20 years systematic "
-                "literature review experience. Label clusters with precise academic IS "
-                "terminology. Labels: 4-7 words, noun-phrase, IS-specific "
-                "(e.g. 'Enterprise Resource Planning Adoption Barriers', "
-                "'IS Governance Frameworks Healthcare')."
             ),
         },
         {
-            "name": "DIGITAL_MGT",
-            "instruction": (
-                "You are a digital management and organisational behaviour scholar "
-                "specialising in technology adoption and digital transformation. "
-                "Labels: 4-7 words, strategic/managerial framing "
-                "(e.g. 'Organisational Change Through Digital Platforms', "
-                "'Strategic IT-Business Alignment Mechanisms')."
             ),
         },
         {
-            "name": "COMP_SCI",
-            "instruction": (
-                "You are a computer science and AI researcher reviewing IS literature. "
-                "Labels: 4-7 words, technically precise "
-                "(e.g. 'Machine Learning Clinical Decision Support', "
-                "'Cloud Infrastructure Scalability Patterns')."
             ),
         },
     ]
-    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.2)
-    def make_prompt(instruction, batch):
-        mini = [{"cluster_id": c["cluster_id"], "paper_count": c["paper_count"],
-                 "top3_titles": c["top3_titles"], "top3_abstracts": c["top3_abstracts"]}
-                for c in batch]
         return (
-            instruction + "\n\n"
-            "Label each cluster. IDs in this batch: " + str([c["cluster_id"] for c in batch]) + "\n\n"
             "CLUSTERS:\n" + json.dumps(mini, indent=2) + "\n\n"
             "Return ONLY a raw JSON array — no markdown, no preamble.\n"
             "Each element: cluster_id (int), label (4-7 words), "
             "confidence (High/Medium/Low), reasoning (one sentence)."
         )
     persona_results = [{}, {}, {}]
-    batch_starts = list(range(0, len(clusters), batch_size))
-    for pi, persona in enumerate(PERSONAS):
         all_labels = []
         for bi, start in enumerate(batch_starts):
             batch  = clusters[start: start + batch_size]
-            result = _call_llm_json(llm, make_prompt(persona["instruction"], batch))
-            all_labels.extend(result)
-            _ = time.sleep(10) if bi < len(batch_starts) - 1 else None
         for item in all_labels:
             cid = int(item.get("cluster_id", 0))
             persona_results[pi][cid] = item
-        _ = time.sleep(15) if pi < len(PERSONAS) - 1 else None
     def enrich(cluster):
         cid = cluster["cluster_id"]
-        raw_votes = [str(persona_results[pi].get(cid, {}).get("label", "")).strip()
-                     for pi in range(3)]
-        votes = [v if v and v.lower() not in ("", "none", "null")
-                 else "Cluster {}".format(cid) for v in raw_votes]
-        final = _mode_label(votes)
-        agreement = ("unanimous" if len(set(votes)) == 1
-                     else "majority" if votes.count(final) >= 2
-                     else "split")
         return {
             **cluster,
             "label":                  final,
-            "llm_vote_1_IS_THEORY":   votes[0],
-            "llm_vote_2_DIGITAL_MGT": votes[1],
-            "llm_vote_3_COMP_SCI":    votes[2],
             "confidence_1": persona_results[0].get(cid, {}).get("confidence", ""),
             "confidence_2": persona_results[1].get(cid, {}).get("confidence", ""),
             "confidence_3": persona_results[2].get(cid, {}).get("confidence", ""),
@@ -412,29 +170,29 @@ def label_clusters_council_of_3(batch_size: int = 5) -> str:
         cid = c["cluster_id"]
         for li, paper in enumerate(c["papers"]):
             rows.append({
-                "cluster_id":              cid,
-                "final_label":             c["label"],
-                "vote_agreement":          c["vote_agreement"],
-                "llm1_IS_THEORY_label":    c["llm_vote_1_IS_THEORY"],
-                "llm2_DIGITAL_MGT_label":  c["llm_vote_2_DIGITAL_MGT"],
-                "llm3_COMP_SCI_label":     c["llm_vote_3_COMP_SCI"],
-                "llm1_confidence":         c["confidence_1"],
-                "llm2_confidence":         c["confidence_2"],
-                "llm3_confidence":         c["confidence_3"],
-                "llm1_reasoning":          c["reasoning_1"],
-                "llm2_reasoning":          c["reasoning_2"],
-                "llm3_reasoning":          c["reasoning_3"],
-                "paper_doi":               paper.get("doi", ""),
-                "paper_title":             paper.get("title", ""),
-                "paper_year":              paper.get("year", ""),
-                "paper_journal":           paper.get("journal", ""),
-                "abstract_preview":        paper.get("abstract", "")[:300],
-                "combined_preview":        paper.get("combined", "")[:200],
-                "centroid_cosine_sim":     round(float(
                     c["centroid_sims"][li] if li < len(c["centroid_sims"]) else 0.0), 4),
-                "hdbscan_probability":     round(float(
                     c["hdbscan_probs"][li] if li < len(c["hdbscan_probs"]) else 0.0), 4),
-                "is_top3_centroid":        "YES" if li in c["top3_paper_idx"] else "no",
             })
     pd.DataFrame(rows).to_csv(p["audit_csv"], index=False, encoding="utf-8-sig")
@@ -447,96 +205,6 @@ def label_clusters_council_of_3(batch_size: int = 5) -> str:
         "majority":         majority,
         "split":            len(enriched) - unanimous - majority,
         "audit_csv_rows":   len(rows),
-        "note": "Audit CSV ready ({} rows, one per paper). Download from Download tab.".format(len(rows)),
-    })
-# =============================================================================
-# V2 TOOL 4 — map_clusters_to_pajais_v2
-# =============================================================================
-@tool
-def map_clusters_to_pajais_v2() -> str:
-    """Map v2 cluster labels to PAJAIS 25 IS research categories via Mistral LLM.
-    Saves taxonomy to data/v2/taxonomy.json. Independent of v1 taxonomy.
-    """
-    import time
-    p         = _p2()
-    summaries = json.loads(p["summaries"].read_text())
-    llm       = ChatMistralAI(model="mistral-small-latest", temperature=0.1)
-    mini = [{"cluster_id": s["cluster_id"], "name": s["label"],
-             "sample": s["top3_titles"][:2]} for s in summaries]
-    BATCH = 10
-    starts = list(range(0, len(mini), BATCH))
-    results = []
-    for bi, start in enumerate(starts):
-        batch  = mini[start: start + BATCH]
-        prompt = (
-            "Map each IS research cluster to the single most relevant PAJAIS category.\n\n"
-            "CLUSTERS:\n" + json.dumps(batch, indent=2) + "\n\n"
-            "PAJAIS CATEGORIES:\n" + json.dumps(PAJAIS_CATEGORIES, indent=2) + "\n\n"
-            "Return ONLY a raw JSON array. Each element: "
-            "cluster_id (int), name (str), pajais_category (str), "
-            "confidence (High/Medium/Low), rationale (one sentence). "
-            "No markdown."
-        )
-        results.extend(_call_llm_json(llm, prompt))
-        _ = time.sleep(10) if bi < len(starts) - 1 else None
-    p["taxonomy"].write_text(json.dumps(results, indent=2, ensure_ascii=False))
-    return json.dumps({"mapped_clusters": len(results),
-                       "note": "PAJAIS taxonomy saved to data/v2/taxonomy.json"})
-# =============================================================================
-# V2 TOOL 5 — export_v2_outputs
-# =============================================================================
-@tool
-def export_v2_outputs() -> str:
-    """Generate final comparison_v2.csv and narrative_v2.txt for the SPECTER2 run.
-    comparison_v2.csv: enriched audit CSV with PAJAIS column added.
-    narrative_v2.txt: 500-word Section 7 academic discussion.
-    Both saved to data/v2/ and data/comparison_v2.csv.
-    """
-    p         = _p2()
-    summaries = json.loads(p["summaries"].read_text())
-    taxonomy  = json.loads(p["taxonomy"].read_text())
-    tax_map   = {str(item.get("cluster_id", "")): item.get("pajais_category", "Unknown")
-                 for item in taxonomy}
-    audit_df = pd.read_csv(p["audit_csv"], encoding="utf-8-sig")
-    audit_df["pajais_category"] = [
-        tax_map.get(str(int(float(str(row["cluster_id"])))), "Unknown")
-        for _, row in audit_df.iterrows()
-    ]
-    out_path = p["comparison"]
-    audit_df.to_csv(out_path, index=False, encoding="utf-8-sig")
-    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.4)
-    cluster_summary = [{"cluster": s["cluster_id"], "label": s["label"],
-                        "papers": s["paper_count"], "agreement": s["vote_agreement"]}
-                       for s in summaries]
-    prompt = (
-        "Write Section 7 (Discussion and Thematic Synthesis) for a systematic "
-        "IS literature review. ~500 words, formal academic prose.\n"
-        "Method: SPECTER2 document embeddings + UMAP + HDBSCAN + council-of-3-LLMs labeling.\n"
-        "Cover: (a) overview of clusters/themes, (b) dominant PAJAIS categories, "
-        "(c) inter-cluster relationships, (d) implications for IS research, "
-        "(e) methodological contribution vs traditional BERTopic, (f) limitations.\n\n"
-        "CLUSTERS:\n" + json.dumps(cluster_summary, indent=2) + "\n\n"
-        "PAJAIS MAPPING:\n" + json.dumps(taxonomy, indent=2) + "\n\n"
-        "Continuous academic paragraphs only. No bullet points or headers."
-    )
-    response  = llm.invoke([HumanMessage(content=prompt)])
-    narrative = response.content
-    p["narrative"].write_text(narrative, encoding="utf-8")
-    return json.dumps({
-        "comparison_csv_rows": len(audit_df),
-        "comparison_csv_path": str(out_path),
-        "narrative_words":     len(narrative.split()),
-        "narrative_path":      str(p["narrative"]),
-        "note": "comparison_v2.csv + narrative_v2.txt ready in Download tab.",
     })

 # =============================================================================
+# V2 TOOL 3 — label_clusters_council_of_3  (TRUE multi-LLM ensemble)
 # =============================================================================
 @tool
 def label_clusters_council_of_3(batch_size: int = 5) -> str:
+    """Label each cluster using a TRUE council of 3 DIFFERENT LLMs:
+      1. Mistral (mistral-small-latest)
+      2. OpenAI  (gpt-4o-mini)
+      3. Groq    (llama3-70b-8192)
+    Each model receives the SAME prompt independently.
     Final label = mode (most common) of the 3 responses.
     Vote agreement = unanimous / majority / split.
     Saves enriched summaries + full audit CSV (one row per paper) to data/v2/.
+    API keys are read automatically from environment variables:
+      MISTRAL_API_KEY, OPENAI_API_KEY, GROQ_API_KEY
+    Set these in HuggingFace Space → Settings → Variables and Secrets.
     Args:
         batch_size: Clusters per LLM call (default 5).
     """
     import time
+    import os
+    # ── NEW: import all 3 LangChain integrations ──────────────────────────────
+    from langchain_mistralai import ChatMistralAI
+    from langchain_openai import ChatOpenAI
+    from langchain_groq import ChatGroq
+    # ─────────────────────────────────────────────────────────────────────────
     p        = _p2()
     clusters = json.loads(p["clusters"].read_text())
+    # ── NEW: define 3 real LLMs (keys picked up from env automatically) ───────
+    COUNCIL = [
         {
+            "name":  "MISTRAL",
+            "model": ChatMistralAI(
+                model="mistral-small-latest",
+                temperature=0.2,
+                # api_key read from MISTRAL_API_KEY env var automatically
             ),
         },
         {
+            "name":  "OPENAI",
+            "model": ChatOpenAI(
+                model="gpt-4o-mini",
+                temperature=0.2,
+                # api_key read from OPENAI_API_KEY env var automatically
             ),
         },
         {
+            "name":  "GROQ",
+            "model": ChatGroq(
+                model="llama3-70b-8192",
+                temperature=0.2,
+                # api_key read from GROQ_API_KEY env var automatically
             ),
         },
     ]
+    # ─────────────────────────────────────────────────────────────────────────
+    # ── CHANGED: single shared prompt builder — persona instructions dropped, same prompt for all 3 LLMs ──
+    def make_prompt(batch):
+        mini = [
+            {
+                "cluster_id":     c["cluster_id"],
+                "paper_count":    c["paper_count"],
+                "top3_titles":    c["top3_titles"],
+                "top3_abstracts": c["top3_abstracts"],
+            }
+            for c in batch
+        ]
         return (
+            "You are an Information Systems research expert conducting a systematic "
+            "literature review. Label each cluster with a precise 4-7 word noun-phrase "
+            "that reflects its core IS research theme.\n\n"
+            "Cluster IDs in this batch: " + str([c["cluster_id"] for c in batch]) + "\n\n"
             "CLUSTERS:\n" + json.dumps(mini, indent=2) + "\n\n"
             "Return ONLY a raw JSON array — no markdown, no preamble.\n"
             "Each element: cluster_id (int), label (4-7 words), "
             "confidence (High/Medium/Low), reasoning (one sentence)."
         )
+    # ─────────────────────────────────────────────────────────────────────────
+    # ── NEW: run each LLM independently across all batches ───────────────────
+    # persona_results[i] = { cluster_id: {label, confidence, reasoning} }
+    # shape is identical to before so all downstream code is UNCHANGED
     persona_results = [{}, {}, {}]
+    batch_starts    = list(range(0, len(clusters), batch_size))
+    for pi, member in enumerate(COUNCIL):
+        llm        = member["model"]
+        llm_name   = member["name"]
         all_labels = []
+        print(f"Council member {pi+1}/3 ({llm_name}) labeling {len(clusters)} clusters...")
         for bi, start in enumerate(batch_starts):
             batch  = clusters[start: start + batch_size]
+            prompt = make_prompt(batch)          # same prompt for every LLM
+            # ── NEW: per-model error handling so one failure doesn't kill all ─
+            try:
+                result = _call_llm_json(llm, prompt)
+                all_labels.extend(result)
+            except Exception as e:
+                print(f"  WARNING: {llm_name} batch {bi} failed: {e}. Using fallback labels.")
+                for c in batch:
+                    all_labels.append({
+                        "cluster_id": c["cluster_id"],
+                        "label":      f"Cluster {c['cluster_id']} ({llm_name} error)",
+                        "confidence": "Low",
+                        "reasoning":  f"Fallback — {llm_name} error: {str(e)[:80]}",
+                    })
+            # ─────────────────────────────────────────────────
+            # small delay between batches to respect rate limits
+            if bi < len(batch_starts) - 1:
+                time.sleep(8)
         for item in all_labels:
             cid = int(item.get("cluster_id", 0))
             persona_results[pi][cid] = item
+        # delay between council members (Groq is fast, Mistral/OpenAI need breathing room)
+        if pi < len(COUNCIL) - 1:
+            time.sleep(10)
+    # ─────────────────────────────────────────────────────────────────────────
+    # ── From here down: voting + enrichment + CSV export (logic unchanged; vote keys/columns renamed to model names) ──
     def enrich(cluster):
         cid = cluster["cluster_id"]
+        raw_votes = [
+            str(persona_results[pi].get(cid, {}).get("label", "")).strip()
+            for pi in range(3)
+        ]
+        votes = [
+            v if v and v.lower() not in ("", "none", "null")
+            else "Cluster {}".format(cid)
+            for v in raw_votes
+        ]
+        final     = _mode_label(votes)
+        agreement = (
+            "unanimous" if len(set(votes)) == 1
+            else "majority" if votes.count(final) >= 2
+            else "split"
+        )
         return {
             **cluster,
             "label":                  final,
+            "llm_vote_1_MISTRAL":     votes[0],   # key renamed to match real model
+            "llm_vote_2_OPENAI":      votes[1],   # key renamed to match real model
+            "llm_vote_3_GROQ":        votes[2],   # key renamed to match real model
             "confidence_1": persona_results[0].get(cid, {}).get("confidence", ""),
             "confidence_2": persona_results[1].get(cid, {}).get("confidence", ""),
             "confidence_3": persona_results[2].get(cid, {}).get("confidence", ""),
         cid = c["cluster_id"]
         for li, paper in enumerate(c["papers"]):
             rows.append({
+                "cluster_id":             cid,
+                "final_label":            c["label"],
+                "vote_agreement":         c["vote_agreement"],
+                "llm1_MISTRAL_label":     c["llm_vote_1_MISTRAL"],   # renamed
+                "llm2_OPENAI_label":      c["llm_vote_2_OPENAI"],    # renamed
+                "llm3_GROQ_label":        c["llm_vote_3_GROQ"],      # renamed
+                "llm1_confidence":        c["confidence_1"],
+                "llm2_confidence":        c["confidence_2"],
+                "llm3_confidence":        c["confidence_3"],
+                "llm1_reasoning":         c["reasoning_1"],
+                "llm2_reasoning":         c["reasoning_2"],
+                "llm3_reasoning":         c["reasoning_3"],
+                "paper_doi":              paper.get("doi", ""),
+                "paper_title":            paper.get("title", ""),
+                "paper_year":             paper.get("year", ""),
+                "paper_journal":          paper.get("journal", ""),
+                "abstract_preview":       paper.get("abstract", "")[:300],
+                "combined_preview":       paper.get("combined", "")[:200],
+                "centroid_cosine_sim":    round(float(
                     c["centroid_sims"][li] if li < len(c["centroid_sims"]) else 0.0), 4),
+                "hdbscan_probability":    round(float(
                     c["hdbscan_probs"][li] if li < len(c["hdbscan_probs"]) else 0.0), 4),
+                "is_top3_centroid":       "YES" if li in c["top3_paper_idx"] else "no",
             })
     pd.DataFrame(rows).to_csv(p["audit_csv"], index=False, encoding="utf-8-sig")
         "majority":         majority,
         "split":            len(enriched) - unanimous - majority,
         "audit_csv_rows":   len(rows),
+        "council_members":  [m["name"] for m in COUNCIL],           # NEW: visible in output
+        "note": "True 3-LLM ensemble (Mistral+OpenAI+Groq). Audit CSV ready ({} rows).".format(len(rows)),
     })