Spaces:

aadisawant2912
/

topic_modelling

Sleeping

App Files Files Community

aadisawant2912 commited on about 1 month ago

Commit

bccf63d

verified ·

1 Parent(s): ba97cd6

Create tools_v2.py

Browse files

Files changed (1) hide show

tools_v2.py +642 -0

tools_v2.py ADDED Viewed

	@@ -0,0 +1,642 @@

+"""
+tools_v2.py - SPECTER2 + HDBSCAN + UMAP thematic analysis tools.
+NEW in v2:
+  - Combined Title+Abstract text per paper (with DOI)
+  - SPECTER2 document-level embeddings (allenai/specter2_base)
+  - UMAP dimensionality reduction
+  - HDBSCAN density-based clustering (min 5, max 120 papers per cluster)
+  - Cosine similarity threshold 0.50-0.60
+  - Target 15-30 clusters (manageable for journal discussion)
+  - Council-of-3-LLMs labeling (Mistral + two prompt variants) → mode vote
+  - Rich audit CSV: cluster assignments, 3 LLM decisions, final label,
+    top sentences, source paper titles
+RULES: ZERO if/else, ZERO for/while, ZERO try/except, ZERO PromptTemplate.
+"""
+from __future__ import annotations
+import json
+import re
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import plotly.express as px
+from langchain_core.tools import tool
+from langchain_core.messages import HumanMessage
+from langchain_mistralai import ChatMistralAI
# Root folder for every artefact this module reads or writes.
# It is created as a side effect of importing the module.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Closed list of 25 category names handed to the LLM when clusters are mapped
# to the PAJAIS taxonomy (see map_clusters_to_pajais_v2).
PAJAIS_CATEGORIES = [
    "Information Systems Theory",
    "IS Strategy & Governance",
    "Digital Innovation",
    "Enterprise Systems",
    "AI & Intelligent Systems",
    "Big Data & Analytics",
    "Cybersecurity & Privacy",
    "Cloud Computing",
    "IS in Healthcare",
    "IS in Education",
    "E-Commerce & Digital Markets",
    "Social Media & Platforms",
    "Human-Computer Interaction",
    "IS Project Management",
    "IT Outsourcing",
    "Knowledge Management",
    "IS Development Methodologies",
    "Digital Transformation",
    "IS Ethics & Society",
    "IS in Developing Countries",
    "Mobile Computing",
    "IT Infrastructure",
    "IS Adoption & Diffusion",
    "IS Evaluation",
    "Organizational IS & Change",
]

# ── lazy-load heavy models ─────────────────────────────────────────────────────
# Module-level slots filled by _get_specter / _get_umap / _get_hdbscan so the
# heavy third-party imports happen only when a tool actually needs them.
_SPECTER_MODEL = None
_UMAP_MODULE = None
_HDBSCAN_MODULE = None
def _get_specter():
    """Return the ``(tokenizer, model)`` pair for SPECTER2, loading it once.

    The first call downloads/loads ``allenai/specter2_base`` through
    ``transformers`` and stores the pair in the module-level
    ``_SPECTER_MODEL`` slot; later calls return that same pair without
    touching ``transformers`` again.

    Returns:
        A 2-tuple ``(tokenizer, model)`` with the model switched to eval mode.
    """
    global _SPECTER_MODEL
    # Honour the cache: previously the slot was written but never read, so
    # every embedding call re-loaded the tokenizer and the model weights.
    if _SPECTER_MODEL is not None:
        return _SPECTER_MODEL
    # Imported here so that merely importing this module stays cheap.
    from transformers import AutoTokenizer, AutoModel
    # Use base specter2 which does not need adapters
    model_id = "allenai/specter2_base"
    print("Loading SPECTER2 model (first call)...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id)
    model.eval()
    _SPECTER_MODEL = (tokenizer, model)
    print("SPECTER2 loaded.")
    return _SPECTER_MODEL
def _embed_specter(texts: list[str], batch_size: int = 8) -> np.ndarray:
    """Embed ``texts`` with SPECTER2 and return L2-normalised CLS vectors.

    Args:
        texts: Documents to embed; each is truncated to 512 tokens.
        batch_size: Number of texts tokenised and run through the model per
            forward pass (default 8, the value previously hard-coded).

    Returns:
        A 2-D array with one row per input text, each row scaled to unit
        length. For an empty ``texts`` the result has zero rows.
    """
    import torch
    tokenizer, model = _get_specter()
    if not texts:
        # np.vstack([]) raises, so return an explicit empty matrix instead.
        # NOTE(review): assumes the loaded model exposes config.hidden_size
        # (true for BERT-style configs) - confirm for other checkpoints.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    step = max(1, int(batch_size))  # a zero/negative step would break range()
    chunks = []
    for start in range(0, len(texts), step):
        batch = texts[start: start + step]
        inputs = tokenizer(
            batch, padding=True, truncation=True,
            max_length=512, return_tensors="pt"
        )
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            out = model(**inputs)
        # CLS token embedding (position 0 of the last hidden state)
        emb = out.last_hidden_state[:, 0, :].numpy()
        # L2 normalize; the floor avoids dividing by zero for a null vector
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        chunks.append(emb / np.maximum(norms, 1e-9))
    return np.vstack(chunks)
def _get_umap():
    """Return the ``umap`` module, importing it only on first use.

    The module object is kept in the module-level ``_UMAP_MODULE`` slot and
    that slot is consulted first, so the cache declared in the lazy-load
    section is actually used rather than merely written.
    """
    global _UMAP_MODULE
    if _UMAP_MODULE is None:
        import umap as umap_mod
        _UMAP_MODULE = umap_mod
    return _UMAP_MODULE
def _get_hdbscan():
    """Return the ``hdbscan`` module, importing it only on first use.

    The module object is kept in the module-level ``_HDBSCAN_MODULE`` slot
    and that slot is consulted first, so the cache declared in the lazy-load
    section is actually used rather than merely written.
    """
    global _HDBSCAN_MODULE
    if _HDBSCAN_MODULE is None:
        import hdbscan as hdbscan_mod
        _HDBSCAN_MODULE = hdbscan_mod
    return _HDBSCAN_MODULE
def _p2() -> dict:
    """Build the path table for a v2 run, creating ``DATA_DIR/v2`` if needed.

    Returns:
        A dict whose ``"dir"`` entry is the v2 folder, followed by one entry
        per artefact stored inside it, and finally ``"comparison"``, which
        lives directly under ``DATA_DIR``.
    """
    run_dir = DATA_DIR / "v2"
    run_dir.mkdir(parents=True, exist_ok=True)
    # key -> file name inside the v2 folder (insertion order is preserved)
    file_names = {
        "papers":     "papers.json",
        "embeddings": "embeddings.npy",
        "umap_emb":   "umap_emb.npy",
        "clusters":   "clusters.json",
        "summaries":  "summaries.json",
        "taxonomy":   "taxonomy.json",
        "charts":     "charts.json",
        "audit_csv":  "cluster_audit.csv",
        "narrative":  "narrative_v2.txt",
    }
    paths = {"dir": run_dir}
    paths.update({key: run_dir / name for key, name in file_names.items()})
    paths["comparison"] = DATA_DIR / "comparison_v2.csv"
    return paths
def safe_read_csv(path):
    """Read a CSV as UTF-8, retrying as Latin-1 when the bytes are not valid UTF-8.

    Args:
        path: Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    try:
        frame = pd.read_csv(path, encoding="utf-8")
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this retry cannot fail
        # on decoding.
        frame = pd.read_csv(path, encoding="latin-1")
    return frame
def _strip_code_fence(raw: str) -> str:
    """Return the JSON payload of an LLM reply with any Markdown fence removed.

    Handles a ```` ```json ```` fence (any letter case), a plain ```` ``` ````
    fence, a fence that was opened but never closed, and a stray closing
    fence after the payload. When several fenced blocks are present the last
    ``json``-tagged one wins, which matches the previous split-based logic.
    A reply without any fence is returned stripped but otherwise untouched.
    """
    text = raw.strip()
    if "```" not in text:
        return text
    # Each match is (tag, body); the closing fence is optional so a truncated
    # reply still yields its body.
    fenced = re.findall(
        r"```(json)?\s*(.*?)(?:```|\Z)", text, flags=re.DOTALL | re.IGNORECASE
    )
    tagged = [body for tag, body in fenced if tag]
    # Preference order: last json-tagged block, then any fenced block, then
    # whatever precedes the first fence (covers "payload + stray fence").
    candidates = tagged[-1:] + [body for _tag, body in fenced] + [text.split("```")[0]]
    return next((c.strip() for c in candidates if c.strip()), "")


def _call_llm_json(llm, prompt: str):
    """Call LLM, strip markdown, parse JSON.

    Args:
        llm: Chat model object exposing ``invoke(messages)``.
        prompt: Full prompt text, sent as a single human message.

    Returns:
        The Python object decoded from the model's reply.

    Raises:
        json.JSONDecodeError: if the reply, once unfenced, is not valid JSON.
    """
    response = llm.invoke([HumanMessage(content=prompt)])
    return json.loads(_strip_code_fence(response.content))
def _mode_label(labels: list[str]) -> str:
    """Pick the most frequent label; on a tie the label seen first wins.

    Raises:
        IndexError: if ``labels`` is empty.
    """
    from collections import Counter
    winner, _votes = Counter(labels).most_common(1)[0]
    return winner
# =============================================================================
# V2 TOOL 1 — load_and_embed_specter2
# =============================================================================
@tool
def load_and_embed_specter2(csv_path: str = "data/uploaded.csv") -> str:
    """Load Scopus CSV, build combined Title+Abstract text per paper, embed with SPECTER2.
    Saves papers metadata + embeddings to data/v2/.
    Args:
        csv_path: Path to uploaded Scopus CSV.
    """
    # Returns a JSON string with total_papers, valid_papers, embedding_dim and
    # a note. Raises ValueError when no row has more than five words of
    # combined text, because there is then nothing to embed.
    p = _p2()
    df = safe_read_csv(csv_path)
    n_rows = len(df)
    exact = {c.strip().lower(): c for c in df.columns}

    def fuzzy(keyword: str):
        """First column whose lower-cased name contains ``keyword``, else None."""
        return next((c for c in df.columns if keyword in c.lower()), None)

    def values_of(col) -> list:
        """Column values with blanks for missing cells; all blanks if absent."""
        return df[col].fillna("").tolist() if col else [""] * n_rows

    def year_text(value) -> str:
        """Render a year cell; 2020.0 (float column with gaps) becomes '2020'."""
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    # An exact header match wins; otherwise fall back to a substring match.
    title_col = exact.get("title", fuzzy("title"))
    abstract_col = exact.get("abstract", fuzzy("abstract"))
    doi_col = exact.get("doi", fuzzy("doi"))
    year_col = exact.get("year", fuzzy("year"))
    journal_col = fuzzy("source")

    # str() keeps later slicing/JSON code safe even if a cell parsed as a number.
    titles = [str(v) for v in values_of(title_col)]
    abstracts = [str(v) for v in values_of(abstract_col)]
    dois = [str(v) for v in values_of(doi_col)]
    years = [year_text(v) for v in values_of(year_col)]
    journals = [str(v) for v in values_of(journal_col)]

    combined_texts = [
        "{} {}".format(t.strip(), a.strip()).strip()
        for t, a in zip(titles, abstracts)
    ]
    # Keep only rows with more than five words of combined text.
    valid_indices = [
        i for i, text in enumerate(combined_texts) if len(text.split()) > 5
    ]
    if not valid_indices:
        raise ValueError(
            "No paper in {!r} has more than five words of title+abstract text; "
            "nothing to embed.".format(csv_path)
        )

    papers = [
        {
            "paper_idx":    i,
            "title":        titles[i],
            "abstract":     abstracts[i],
            "doi":          dois[i],
            "year":         years[i],
            "journal":      journals[i],
            "combined":     combined_texts[i],
        }
        for i in valid_indices
    ]
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned instead of inherited from the platform locale.
    p["papers"].write_text(
        json.dumps(papers, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    valid_texts = [combined_texts[i] for i in valid_indices]
    print("Embedding {} papers with SPECTER2...".format(len(valid_texts)))
    embs = _embed_specter(valid_texts)
    np.save(p["embeddings"], embs)
    return json.dumps({
        "total_papers":    n_rows,
        "valid_papers":    len(papers),
        "embedding_dim":   int(embs.shape[1]),
        "note": "Combined Title+Abstract embedded with SPECTER2. Ready for UMAP+HDBSCAN.",
    })
# =============================================================================
# V2 TOOL 2 — cluster_with_umap_hdbscan
# =============================================================================
@tool
def cluster_with_umap_hdbscan(
    umap_neighbors: int = 15,
    umap_min_dist: float = 0.05,
    hdbscan_min_cluster_size: int = 5,
    hdbscan_min_samples: int = 3,
) -> str:
    """Reduce SPECTER2 embeddings with UMAP then cluster with HDBSCAN.
    Targets 15-30 clusters, each containing 5-120 papers.
    Cosine metric throughout. Saves cluster assignments to data/v2/clusters.json.
    Args:
        umap_neighbors:           UMAP n_neighbors (default 15).
        umap_min_dist:            UMAP min_dist (default 0.05 for tighter clusters).
        hdbscan_min_cluster_size: Minimum papers per cluster (default 5).
        hdbscan_min_samples:      HDBSCAN min_samples for core points (default 3).
    """
    # Returns a JSON string summarising the run. Raises ValueError when the
    # saved papers and embeddings do not describe the same number of rows.
    p = _p2()
    embs = np.load(p["embeddings"])
    papers = json.loads(p["papers"].read_text(encoding="utf-8"))
    if len(papers) != len(embs):
        # Row i of the embeddings is looked up as papers[i] below, so a stale
        # file from another run would silently mislabel every cluster.
        raise ValueError(
            "papers.json has {} entries but embeddings.npy has {} rows; "
            "re-run load_and_embed_specter2.".format(len(papers), len(embs))
        )
    umap_mod = _get_umap()
    hdbscan_mod = _get_hdbscan()

    def reduce_to(n_components: int) -> np.ndarray:
        """Project the embeddings with UMAP (cosine metric, fixed seed)."""
        reducer = umap_mod.UMAP(
            n_components=n_components,
            n_neighbors=umap_neighbors,
            min_dist=umap_min_dist,
            metric="cosine",
            random_state=42,
            verbose=False,
        )
        return reducer.fit_transform(embs)

    print("Running UMAP (n={}, min_dist={})...".format(umap_neighbors, umap_min_dist))
    umap_embs = reduce_to(5)
    np.save(p["umap_emb"], umap_embs)

    print("Running HDBSCAN (min_cluster={})...".format(hdbscan_min_cluster_size))
    clusterer = hdbscan_mod.HDBSCAN(
        min_cluster_size=hdbscan_min_cluster_size,
        min_samples=hdbscan_min_samples,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )
    labels = clusterer.fit_predict(umap_embs)
    probs = clusterer.probabilities_
    # Label -1 is HDBSCAN's noise bucket, not a cluster.
    unique_clusters = sorted(set(labels.tolist()) - {-1})
    print("HDBSCAN found {} clusters (excl. noise)".format(len(unique_clusters)))

    def build_cluster_record(cid):
        """Collect papers, centroid and similarity statistics for one raw label."""
        mask = labels == cid
        indices = [i for i, member in enumerate(mask.tolist()) if member]
        cluster_papers = [papers[i] for i in indices]
        cluster_embs = embs[mask]
        centroid = cluster_embs.mean(axis=0)
        # Cosine similarity of each paper to the centroid, computed on the
        # original embeddings rather than the UMAP projection.
        norms = np.linalg.norm(cluster_embs, axis=1, keepdims=True)
        normed = cluster_embs / np.maximum(norms, 1e-9)
        c_norm = centroid / max(np.linalg.norm(centroid), 1e-9)
        sims = (normed @ c_norm).tolist()
        # Top 3 papers closest to centroid (positions local to this cluster)
        top3_idx = sorted(range(len(sims)), key=lambda x: -sims[x])[:3]
        return {
            "cluster_id":      cid,
            "paper_count":     int(mask.sum()),
            "papers":          cluster_papers,
            "paper_indices":   indices,
            "hdbscan_probs":   probs[mask].tolist(),
            "centroid_sims":   sims,
            "centroid":        centroid.tolist(),
            "top3_paper_idx":  top3_idx,
            "top3_titles":     [cluster_papers[i]["title"] for i in top3_idx],
            "top3_abstracts":  [cluster_papers[i]["abstract"][:200] for i in top3_idx],
        }

    all_clusters_raw = [build_cluster_record(cid) for cid in unique_clusters]
    # Keep clusters with 5-120 papers, largest first (sorted() is stable).
    kept = sorted(
        (c for c in all_clusters_raw if 5 <= c["paper_count"] <= 120),
        key=lambda c: -c["paper_count"],
    )
    # Raw HDBSCAN label -> sequential id 1..N used everywhere downstream.
    final_id = {c["cluster_id"]: seq for seq, c in enumerate(kept, start=1)}
    valid_clusters = [{**c, "cluster_id": final_id[c["cluster_id"]]} for c in kept]
    noise_count = int((labels == -1).sum())
    size_filtered_papers = sum(
        c["paper_count"] for c in all_clusters_raw if c["cluster_id"] not in final_id
    )

    def chart_cluster(raw_label: int) -> str:
        """Legend entry for a paper: renumbered id, '-1' (noise) or 'dropped'."""
        if raw_label == -1:
            return "-1"
        renumbered = final_id.get(raw_label)
        return "dropped" if renumbered is None else str(renumbered)

    # Build 2D UMAP for scatter chart
    umap_2d = reduce_to(2)
    chart_df = pd.DataFrame({
        "x":          umap_2d[:, 0].tolist(),
        "y":          umap_2d[:, 1].tolist(),
        # Colour by the renumbered ids so the scatter agrees with the bar
        # chart and with clusters.json (it used the raw HDBSCAN labels before).
        "cluster":    [chart_cluster(raw) for raw in labels.tolist()],
        "title":      [str(pp["title"])[:50] for pp in papers],
        "prob":       probs.tolist(),
    })
    fig = px.scatter(
        chart_df, x="x", y="y", color="cluster",
        hover_data=["title", "prob"],
        title="UMAP + HDBSCAN Clusters ({} clusters, {} noise)".format(
            len(valid_clusters), noise_count
        ),
        labels={"x": "UMAP-1", "y": "UMAP-2"},
    )
    fig_bar = px.bar(
        x=["C{}".format(c["cluster_id"]) for c in valid_clusters],
        y=[c["paper_count"] for c in valid_clusters],
        title="Papers per Cluster",
        labels={"x": "Cluster", "y": "Papers"},
    )
    charts = {
        # Only the first fragment pulls plotly.js; the second reuses it.
        "scatter": fig.to_html(full_html=False, include_plotlyjs="cdn"),
        "bar":     fig_bar.to_html(full_html=False, include_plotlyjs=False),
    }
    p["charts"].write_text(json.dumps(charts), encoding="utf-8")
    p["clusters"].write_text(
        json.dumps(valid_clusters, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    return json.dumps({
        "clusters_found":       len(valid_clusters),
        "noise_papers":         noise_count,
        "size_filtered_papers": size_filtered_papers,
        "total_papers":         len(papers),
        "cluster_sizes":        [c["paper_count"] for c in valid_clusters],
        "note": "Clusters 1..{}, 5-120 papers each. Ready for council-of-3 labeling.".format(
            len(valid_clusters)
        ),
    })
# =============================================================================
# V2 TOOL 3 — label_clusters_council_of_3
# Council of 3 LLMs: Mistral-small × 3 with distinct expert personas/prompts
# Mode vote on final label.
# =============================================================================
@tool
def label_clusters_council_of_3(batch_size: int = 5) -> str:
    """Label clusters using council-of-3 LLMs (3 Mistral calls with distinct personas).
    Uses top-3 sentences closest to centroid per cluster.
    Final label = mode of 3 LLM responses.
    Saves enriched summaries + audit CSV to data/v2/.
    Args:
        batch_size: Clusters per LLM call (default 5).
    """
    # Returns a JSON string with vote statistics and the audit CSV location.
    import time
    p = _p2()
    clusters = json.loads(p["clusters"].read_text(encoding="utf-8"))
    # Three distinct expert personas for council voting
    PERSONAS = [
        {
            "name": "IS_THEORY",
            "system": (
                "You are an Information Systems theory expert with 20 years of "
                "systematic literature review experience. You label research clusters "
                "using precise academic IS terminology. Your labels are 4-7 words, "
                "noun-phrase style, highly specific to IS sub-domains."
            ),
        },
        {
            "name": "DIGITAL_MGT",
            "system": (
                "You are a digital management and organizational behavior scholar "
                "specializing in technology adoption and digital transformation. "
                "You label research clusters with managerial and practical framing. "
                "Your labels are 4-7 words, action-oriented yet academic."
            ),
        },
        {
            "name": "COMP_SCI",
            "system": (
                "You are a computer science and AI researcher reviewing IS literature. "
                "You label research clusters from a technical and systems perspective. "
                "Your labels are 4-7 words, technically precise and domain-specific."
            ),
        },
    ]
    # Pauses between API calls; presumably there to respect rate limits.
    BATCH_PAUSE_S = 10
    PERSONA_PAUSE_S = 15
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.2)

    def make_prompt(persona_system: str, batch_clusters: list) -> str:
        """Assemble the labelling prompt for one persona and one batch."""
        mini = [
            {
                "cluster_id": c["cluster_id"],
                "paper_count": c["paper_count"],
                "top3_titles": c["top3_titles"],
                "top3_abstracts": c["top3_abstracts"],
            }
            for c in batch_clusters
        ]
        batch_ids = [c["cluster_id"] for c in batch_clusters]
        return "".join([
            persona_system, "\n\n",
            "Label each research cluster below with a precise 4-7 word academic theme name.\n",
            "The cluster_id values in this batch are: ", str(batch_ids), "\n\n",
            "CLUSTERS:\n", json.dumps(mini, indent=2), "\n\n",
            "Return ONLY a raw JSON array. Each element must have exactly:\n",
            "  cluster_id (integer), label (string 4-7 words), confidence (High/Medium/Low), ",
            "reasoning (one sentence).\n",
            "No markdown, no explanation.",
        ])

    def index_reply(reply) -> dict:
        """Map cluster_id -> answer dict, skipping entries that are malformed."""
        items = reply if isinstance(reply, list) else [reply]
        indexed = {}
        for item in items:
            if not isinstance(item, dict):
                continue
            try:
                cid = int(item.get("cluster_id", 0))
            except (TypeError, ValueError):
                continue
            indexed[cid] = item
        return indexed

    step = max(1, int(batch_size))  # range() rejects a zero step
    batch_starts = list(range(0, len(clusters), step))
    # Results from each of 3 personas: {cluster_id: {label, confidence, reasoning}}
    persona_results = [{} for _ in PERSONAS]
    for p_idx, persona in enumerate(PERSONAS):
        for b_idx, start in enumerate(batch_starts):
            batch = clusters[start: start + step]
            reply = _call_llm_json(llm, make_prompt(persona["system"], batch))
            persona_results[p_idx].update(index_reply(reply))
            if b_idx < len(batch_starts) - 1:
                time.sleep(BATCH_PAUSE_S)
        # No point pausing when nothing was sent or after the last persona.
        if batch_starts and p_idx < len(PERSONAS) - 1:
            time.sleep(PERSONA_PAUSE_S)

    # Council vote: mode of 3 labels per cluster
    def enrich_cluster(cluster):
        """Attach the three persona answers and the voted label to a cluster."""
        cid = cluster["cluster_id"]
        answers = [results.get(cid, {}) for results in persona_results]
        raw_votes = [str(a.get("label", "")).strip() for a in answers]
        # Only genuine labels take part in the vote; a missing answer used to
        # become "Cluster N" and two of those could out-vote a real label.
        real_votes = [v for v in raw_votes if v.lower() not in ("", "none", "null")]
        if real_votes:
            # Compare labels ignoring case and spacing so trivially different
            # spellings of the same label count as agreement.
            keys = [" ".join(v.casefold().split()) for v in real_votes]
            winning_key = _mode_label(keys)
            final_label = real_votes[keys.index(winning_key)]
            support = keys.count(winning_key)
        else:
            final_label = "Cluster {}".format(cid)
            support = 0
        if support == len(PERSONAS):
            agreement = "unanimous"
        elif support >= 2:
            agreement = "majority"
        else:
            agreement = "split"
        return {
            **cluster,
            "label":              final_label,
            "llm_vote_1_IS_THEORY":   answers[0].get("label", ""),
            "llm_vote_2_DIGITAL_MGT": answers[1].get("label", ""),
            "llm_vote_3_COMP_SCI":    answers[2].get("label", ""),
            "confidence_1": answers[0].get("confidence", ""),
            "confidence_2": answers[1].get("confidence", ""),
            "confidence_3": answers[2].get("confidence", ""),
            "reasoning_1":  answers[0].get("reasoning", ""),
            "reasoning_2":  answers[1].get("reasoning", ""),
            "reasoning_3":  answers[2].get("reasoning", ""),
            "vote_agreement": agreement,
        }

    enriched = [enrich_cluster(cluster) for cluster in clusters]
    p["summaries"].write_text(
        json.dumps(enriched, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    # ── Build audit CSV ────────────────────────────────────────────────────────
    # One row per paper-in-cluster
    audit_rows = []
    for cluster in enriched:
        sims = cluster["centroid_sims"]
        member_probs = cluster["hdbscan_probs"]
        top3_positions = set(cluster["top3_paper_idx"])
        for local_idx, paper in enumerate(cluster["papers"]):
            # The per-paper lists should align with "papers"; fall back to 0.0
            # rather than fail if a list is shorter.
            centroid_sim = sims[local_idx] if local_idx < len(sims) else 0.0
            member_prob = (
                member_probs[local_idx] if local_idx < len(member_probs) else 0.0
            )
            audit_rows.append({
                "cluster_id":          cluster["cluster_id"],
                "final_label":         cluster["label"],
                "vote_agreement":      cluster["vote_agreement"],
                "llm1_label_IS_THEORY":   cluster["llm_vote_1_IS_THEORY"],
                "llm2_label_DIGITAL_MGT": cluster["llm_vote_2_DIGITAL_MGT"],
                "llm3_label_COMP_SCI":    cluster["llm_vote_3_COMP_SCI"],
                "llm1_confidence":     cluster["confidence_1"],
                "llm2_confidence":     cluster["confidence_2"],
                "llm3_confidence":     cluster["confidence_3"],
                "llm1_reasoning":      cluster["reasoning_1"],
                "llm2_reasoning":      cluster["reasoning_2"],
                "llm3_reasoning":      cluster["reasoning_3"],
                "paper_doi":           paper.get("doi", ""),
                "paper_title":         paper.get("title", ""),
                "paper_year":          paper.get("year", ""),
                "paper_journal":       paper.get("journal", ""),
                "paper_abstract":      paper.get("abstract", "")[:300],
                "combined_text":       paper.get("combined", "")[:200],
                "centroid_similarity": round(float(centroid_sim), 4),
                "hdbscan_probability": round(float(member_prob), 4),
                "is_top3_centroid":    "YES" if local_idx in top3_positions else "no",
            })
    audit_df = pd.DataFrame(audit_rows)
    p["audit_csv"].parent.mkdir(parents=True, exist_ok=True)
    audit_df.to_csv(p["audit_csv"], index=False, encoding="utf-8-sig")

    unanimous_count = sum(1 for c in enriched if c["vote_agreement"] == "unanimous")
    majority_count = sum(1 for c in enriched if c["vote_agreement"] == "majority")
    return json.dumps({
        "clusters_labeled":  len(enriched),
        "unanimous_votes":   unanimous_count,
        "majority_votes":    majority_count,
        "split_votes":       len(enriched) - unanimous_count - majority_count,
        "audit_csv_path":    str(p["audit_csv"]),
        "audit_csv_rows":    len(audit_rows),
        "note": "Council-of-3 complete. Audit CSV ready for download.",
    })
# =============================================================================
# V2 TOOL 4 — map_clusters_to_pajais_v2
# =============================================================================
@tool
def map_clusters_to_pajais_v2() -> str:
    """Map v2 clusters to PAJAIS 25 categories via Mistral LLM.
    Saves taxonomy to data/v2/taxonomy.json.
    """
    # Returns a JSON string with the number of mapped clusters, how many
    # answers named a category outside PAJAIS_CATEGORIES, and a note.
    import time
    p = _p2()
    summaries = json.loads(p["summaries"].read_text(encoding="utf-8"))
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.1)
    theme_mini = [
        {
            "name":    t["label"],
            "sample":  t["top3_titles"][:2],
            "cluster_id": t["cluster_id"],
        }
        for t in summaries
    ]
    BATCH = 10
    batch_starts = list(range(0, len(theme_mini), BATCH))
    categories_json = json.dumps(PAJAIS_CATEGORIES, indent=2)  # same for every batch
    # Case-insensitive lookup back to the exact spelling in PAJAIS_CATEGORIES.
    canonical = {name.casefold(): name for name in PAJAIS_CATEGORIES}

    def tidy(row: dict) -> dict:
        """Restore the canonical spelling of a recognised category name."""
        category = row.get("pajais_category")
        if not isinstance(category, str):
            return row
        cleaned = category.strip()
        return {**row, "pajais_category": canonical.get(cleaned.casefold(), cleaned)}

    def process_batch(start):
        """Ask the LLM to map one batch; return only well-formed rows."""
        batch = theme_mini[start: start + BATCH]
        prompt = (
            "Map each IS research cluster to the single most relevant PAJAIS category.\n\n"
            "CLUSTERS:\n" + json.dumps(batch, indent=2) + "\n\n"
            "PAJAIS CATEGORIES:\n" + categories_json + "\n\n"
            "Return ONLY a raw JSON array. Each element: "
            "cluster_id, name, pajais_category, confidence, rationale. "
            "No markdown, no explanation."
        )
        reply = _call_llm_json(llm, prompt)
        # A single object instead of an array would otherwise be "extended"
        # key by key into the result list.
        rows = reply if isinstance(reply, list) else [reply]
        return [tidy(row) for row in rows if isinstance(row, dict)]

    all_results = []
    for b_idx, start in enumerate(batch_starts):
        all_results.extend(process_batch(start))
        if b_idx < len(batch_starts) - 1:
            time.sleep(10)  # pause between calls; presumably for rate limits

    p["taxonomy"].write_text(
        json.dumps(all_results, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    off_list = sum(
        1 for row in all_results
        if row.get("pajais_category") not in PAJAIS_CATEGORIES
    )
    return json.dumps({
        "mapped_clusters": len(all_results),
        "off_list_categories": off_list,
        "note": "PAJAIS taxonomy saved to data/v2/taxonomy.json",
    })
# =============================================================================
# V2 TOOL 5 — export_v2_outputs
# Generates comparison_v2.csv and narrative_v2.txt
# =============================================================================
@tool
def export_v2_outputs() -> str:
    """Generate final comparison CSV and narrative for v2 SPECTER2 run.
    comparison_v2.csv: one row per paper with cluster, label, PAJAIS, DOI, etc.
    narrative_v2.txt: 500-word Section 7 discussion.
    """
    # Returns a JSON string with row/word counts and the output paths.
    p = _p2()
    summaries = json.loads(p["summaries"].read_text(encoding="utf-8"))
    taxonomy = json.loads(p["taxonomy"].read_text(encoding="utf-8"))

    def cid_key(value) -> str:
        """Normalise a cluster id so 3, 3.0 and "3" all produce the key "3"."""
        try:
            return str(int(float(value)))
        except (TypeError, ValueError):
            return str(value)

    # The ids come from an LLM reply and may be ints, floats or strings, while
    # the CSV column is numeric; both sides therefore go through cid_key().
    tax_map = {
        cid_key(item.get("cluster_id", "")): item.get("pajais_category", "")
        for item in taxonomy
        if isinstance(item, dict)
    }

    # Build comparison CSV from audit_csv (already per-paper)
    audit_df = pd.read_csv(p["audit_csv"], encoding="utf-8-sig")
    # Add PAJAIS column; clusters the LLM did not map are marked "Unknown".
    audit_df["pajais_category"] = audit_df["cluster_id"].map(
        lambda cid: tax_map.get(cid_key(cid), "Unknown")
    )
    out_path = p["comparison"]
    audit_df.to_csv(out_path, index=False, encoding="utf-8-sig")

    # Narrative
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.4)
    cluster_summary = [
        {
            "cluster":    s["cluster_id"],
            "label":      s["label"],
            "papers":     s["paper_count"],
            "agreement":  s["vote_agreement"],
        }
        for s in summaries
    ]
    prompt = (
        "You are an academic writing expert in Information Systems.\n\n"
        "Write Section 7 (Discussion and Thematic Synthesis) for a systematic "
        "literature review. ~500 words, formal academic prose.\n"
        "The analysis used SPECTER2 embeddings + HDBSCAN clustering.\n"
        "Cover: (a) Overview of clusters/themes found, (b) dominant PAJAIS categories, "
        "(c) inter-cluster relationships, (d) implications for IS research, "
        "(e) methodological contribution of SPECTER2+HDBSCAN vs. traditional BERTopic, "
        "(f) limitations.\n\n"
        "CLUSTERS:\n" + json.dumps(cluster_summary, indent=2) + "\n\n"
        # Only the first 15 mappings are sent; presumably to bound prompt size.
        "PAJAIS MAPPING:\n" + json.dumps(taxonomy[:15], indent=2) + "\n\n"
        "Write in continuous academic paragraphs. No bullet points or headers."
    )
    response = llm.invoke([HumanMessage(content=prompt)])
    narrative = response.content
    p["narrative"].write_text(narrative, encoding="utf-8")
    return json.dumps({
        "comparison_csv_rows": len(audit_df),
        "comparison_csv_path": str(out_path),
        "narrative_words":     len(narrative.split()),
        "narrative_path":      str(p["narrative"]),
        "clusters_in_csv":     len(summaries),
        "note": "All v2 outputs ready in data/v2/ and data/comparison_v2.csv",
    })