BERTopic_AG_final

Sleeping

App Files Files Community

anujjuna committed 13 days ago

Commit

d8fd287

verified ·

1 Parent(s): 05df72c

Update agent.py

Browse files

Files changed (1) hide show

agent.py +314 -427

agent.py CHANGED Viewed

@@ -1,19 +1,25 @@
 """
 agent.py
 --------
-LLM-driven topic interpretation and classification module using a 3-LLM ensemble.
 """
 from __future__ import annotations
 import json
 import logging
 import os
 import time
-from dataclasses import dataclass, asdict
 from typing import Optional
 import pandas as pd
 import requests
-import re
 from groq import Groq
 # ---------------------------------------------------------------------------
@@ -25,503 +31,384 @@ logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Constants
 # ---------------------------------------------------------------------------
-DEFAULT_MODEL = "llama-3.1-8b-instant"
-MISTRAL_DEFAULT_MODEL = "mistral-small-latest"
-DEFAULT_TAXONOMY_CATEGORIES = [
-    "Artificial Intelligence", "Machine Learning", "Natural Language Processing",
-    "Computer Vision", "Information Systems", "Healthcare & Bioinformatics",
-    "Finance & Economics", "Cybersecurity", "Human-Computer Interaction",
-    "Robotics & Automation", "Education Technology", "Environmental Science",
     "Social Sciences", "Data Engineering", "Other",
 ]
 # ---------------------------------------------------------------------------
-# PAJAIS 2019 Knowledge — what the 2019 taxonomy covers vs does NOT cover
 # ---------------------------------------------------------------------------
-PAJAIS_COVERED = [
-    "IS strategy", "IS adoption", "IS governance", "e-commerce", "enterprise systems",
-    "ERP", "knowledge management", "decision support", "e-government", "social media IS",
-    "IT outsourcing", "IS security", "privacy", "IS education", "mobile commerce",
-    "business intelligence", "data analytics", "IS in healthcare (general)",
-    "human computer interaction", "HCI", "IT project management",
-]
-PAJAIS_NOT_COVERED = [
-    "large language models", "LLM", "GPT", "generative AI", "RAG",
-    "process mining", "event log", "Petri net", "conformance checking",
-    "federated learning", "differential privacy", "DP-SGD",
-    "fairness", "algorithmic bias", "responsible AI", "FATE", "XAI", "explainable AI",
-    "blockchain analytics", "smart contract", "DeFi", "tokenomics",
-    "COVID-19 IS", "pandemic informatics",
-    "Android malware", "mobile security", "dark web", "cyber insurance",
-    "agentic AI", "multi-agent orchestration",
-    "transformer", "BERT", "neural topic model", "BERTopic",
-    "recommender neural", "graph neural network", "GNN",
-    "heterogeneous computing", "IoT analytics", "edge computing IS",
-    "talent matching", "job-person fit", "HR analytics",
-]
-# Rule-based NOVEL trigger — fires ONLY on specific, unambiguous compound/technical terms
-# that are definitively absent from PAJAIS 2019.
-# Deliberately narrow: single common words like "data", "model", "network", "learning",
-# "deep", "smart", "financial", "detection" do NOT trigger this — they exist in PAJAIS.
-# Only truly post-2018 or PAJAIS-absent compound terms qualify.
-NOVEL_REGEX_TRIGGERS = re.compile(
-    r'\b('
-    r'llms?|gpt[\-\s]?\d*|large\s+language\s+model|generative\s+ai|'
-    r'federat\w*\s+learn\w*|differential\s+privac\w*|dp\-sgd|'
-    r'process\s+mining|event\s+log|petri\s+net|conformance\s+check\w*|'
-    r'blockchain|smart\s+contract|defi\b|tokenomic\w*|'
-    r'malware|botnet|dark\s+web|cyber\s+insur\w*|'
-    r'responsible\s+ai|explainab\w*\s+ai|algorithmic\s+bias|xai\b|'
-    r'agentic\s+ai|multi.agent\s+orchest\w*|'
-    r'graph\s+neural\s+network|gnn\b|'
-    r'retrieval.augment\w*|prompt\s+engineer\w*|rag\b|'
-    r'talent\s+match\w*|job.person\s+fit|'
-    r'covid.19|pandemic\s+inform\w*'
-    r')\b',
-    re.IGNORECASE
-)
-def _is_deterministic_novel(keywords: list[str], samples: list[str]) -> bool:
-    """Non-LLM rule-based check: fires only on specific unambiguous NOVEL compound terms.
-    Generic single words (data, model, network, learning, detection) do NOT trigger this.
-    The keyword list from BERTopic is checked word-by-word AND as joined text to catch
-    compound matches that span two keywords."""
-    # Check the joined keyword string (catches "process mining" split across two keywords)
-    keyword_text = " ".join(keywords).lower()
-    sample_text  = " ".join(samples).lower()
-    return (
-        bool(NOVEL_REGEX_TRIGGERS.search(keyword_text)) or
-        bool(NOVEL_REGEX_TRIGGERS.search(sample_text))
-    )
-# ---------------------------------------------------------------------------
-# Data Classes
-# ---------------------------------------------------------------------------
 @dataclass
-class TopicInterpretation:
-    """Structured interpretation for a single topic."""
-    topic_id: int
-    label: str
-    category: str
-    classification: str
     paper_count: int = 0
-    keywords: list[str] = None
 # ---------------------------------------------------------------------------
-# API Clients & Calls
 # ---------------------------------------------------------------------------
 def build_groq_client(api_key: Optional[str] = None):
     key = api_key or os.getenv("GROQ_API_KEY")
     if not key:
-        raise ValueError("No Groq API key provided.")
     return Groq(api_key=key, max_retries=0)
-def call_gemini_label(prompt: str, api_key: str) -> dict:
-    """Call Google AI Studio (Gemini) API."""
-    if not api_key: return {}
-    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key={api_key}"
-    headers = {"Content-Type": "application/json"}
-    payload = {"contents": [{"parts": [{"text": prompt}]}], "generationConfig": {"temperature": 0.2}}
     try:
-        response = requests.post(url, headers=headers, json=payload, timeout=10)
-        data = response.json()
-        if "error" in data or "candidates" not in data:
-            logger.error(f"Gemini error / missing candidates. Response: {data}")
-            return {}
-        raw = data["candidates"][0]["content"]["parts"][0]["text"].strip()
-        raw = raw.replace("```json", "").replace("```", "").strip()
-        start = raw.find("{")
-        end = raw.rfind("}") + 1
-        if start != -1 and end != 0:
-            raw = raw[start:end]
-        return json.loads(raw)
     except Exception as e:
-        logger.warning(f"Gemini call failed: {e}")
         return {}
-def call_mistral_label(prompt: str, api_key: str) -> dict:
-    """Call Mistral API."""
-    if not api_key: return {}
     try:
-        response = requests.post(
             "https://api.mistral.ai/v1/chat/completions",
-            headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
-            json={
-                "model": "mistral-small-latest",
-                "messages": [{"role": "user", "content": prompt}],
-                "temperature": 0.2,
-            },
-            timeout=10,
         )
-        data = response.json()
-        raw = data["choices"][0]["message"]["content"].strip()
-        raw = raw.replace("```json", "").replace("```", "").strip()
-        start, end = raw.find("{"), raw.rfind("}") + 1
-        return json.loads(raw[start:end])
     except Exception as e:
-        logger.warning(f"Mistral call failed: {e}")
         return {}
-def _call_llm_json(client, prompt: str, model: str) -> dict:
-    """Call Groq API with robust JSON parsing."""
     try:
-        response = client.chat.completions.create(
-            model=model, messages=[{"role": "user", "content": prompt}], temperature=0.2, timeout=10,
-        )
-        raw = response.choices[0].message.content.strip()
-        raw = raw.replace("```json", "").replace("```", "").strip()
-        start = raw.find("{")
-        end = raw.rfind("}") + 1
-        if start != -1 and end != 0:
-            raw = raw[start:end]
-        return json.loads(raw)
     except Exception as e:
-        logger.warning(f"Groq call failed: {e}")
         return {}
-# ---------------------------------------------------------------------------
-# Logic Helpers
-# ---------------------------------------------------------------------------
-def convert_numpy_types(obj):
-    """Recursively convert numpy types to native Python types for JSON serialisation."""
-    import numpy as np
-    if isinstance(obj, dict):
-        return {k: convert_numpy_types(v) for k, v in obj.items()}
-    elif isinstance(obj, list):
-        return [convert_numpy_types(v) for v in obj]
-    elif isinstance(obj, np.integer):
-        return int(obj)
-    elif isinstance(obj, np.floating):
-        return float(obj)
-    return obj
-def _safe_capitalize(s: str) -> str:
-    s = str(s or "").strip()
-    return s[0].upper() + s[1:] if s else ""
-def clean_label(label: str) -> str:
-    if not label: return ""
-    label = label.replace("\n", " ").strip()
-    label = " ".join(label.split())
-    label = label.rstrip(" .")
-    if len(label) > 60:
-        label = label[:60].rsplit(" ", 1)[0] if " " in label[:60] else label[:60]
-    return label.strip()
-def _get_keyword_overlap(label: str, keywords: list[str]) -> int:
-    label_words = set(label.lower().split())
-    kw_set = set(k.lower() for k in keywords)
-    return len(label_words & kw_set)
-def select_best_interpretation(results: list[dict], keywords: list[str]) -> dict:
-    valid = [r for r in results if r and "label" in r]
-    if not valid: return {}
-    # Majority vote on label
-    counts = {}
-    for r in valid:
-        l = clean_label(r["label"]).lower()
-        counts[l] = counts.get(l, 0) + 1
-    for l, c in counts.items():
-        if c >= 2:
-            best_r = next(r for r in valid if clean_label(r["label"]).lower() == l)
-            best_r["label"] = clean_label(best_r["label"])
-            return best_r
-    # Fallback: keyword overlap or shortest
-    valid.sort(key=lambda x: (-_get_keyword_overlap(clean_label(x["label"]), keywords), len(clean_label(x["label"]))))
-    best_r = valid[0]
-    best_r["label"] = clean_label(best_r["label"])
-    return best_r
-def _fallback_label_from_keywords(keywords: list[str], topic_id: int) -> tuple[str, str]:
-    kw_set = set([k.lower() for k in keywords])
-    mappings = [
-        ({"privacy", "data", "security"}, "Digital Privacy and Security", "Cybersecurity"),
-        ({"ai", "chatbots", "agents"}, "Conversational AI", "Artificial Intelligence"),
-        ({"neural", "network", "deep"}, "Deep Learning Systems", "Machine Learning"),
-    ]
-    for trigger, label, cat in mappings:
-        if any(t in kw_set for t in trigger): return label, cat
-    return f"Topic study on {', '.join(keywords[:2])}", "Other"
 # ---------------------------------------------------------------------------
-# Core Logic — Prompt Builder
 # ---------------------------------------------------------------------------
-def _build_interpretation_prompt(keywords, samples, cats) -> str:
-    pajais_covered_str = "; ".join(PAJAIS_COVERED[:10])
-    pajais_not_str = "; ".join(PAJAIS_NOT_COVERED[:12])
-    return f"""You are an IS research classifier. A BERTopic algorithm produced the following topic cluster from ACM TMIS papers.
-KEYWORDS: {', '.join(keywords)}
-REPRESENTATIVE PAPER TITLES: {' | '.join(samples[:3])}
-TASK: Generate a label and classify this topic against the PAJAIS 2019 taxonomy.
-PAJAIS 2019 COVERS — use MAPPED only if the topic clearly fits one of these:
-{pajais_covered_str}
-PAJAIS 2019 DOES NOT COVER — use NOVEL if the topic fits here:
-{pajais_not_str}
-CLASSIFICATION RULES:
-- NOVEL if the topic involves: LLMs/GPT/generative AI, process mining, federated learning, differential privacy, fairness/XAI/responsible AI, blockchain analytics, COVID-19 IS, mobile malware, dark web, agentic AI, IoT analytics, talent matching, cyber insurance, or any technique that postdates 2018.
-- MAPPED only if it clearly fits an existing PAJAIS 2019 category listed above.
-- When in doubt, choose NOVEL. TMIS is a computational journal and most of its recent topics post-date the 2019 taxonomy.
-TAXONOMY CATEGORIES (for the taxonomy_category field only): {', '.join(cats)}
-Respond ONLY with valid JSON — no other text, no markdown fences:
 {{
-  "label": "<concise 5-8 word label>",
-  "taxonomy_category": "<one category from the list>",
-  "classification": "MAPPED or NOVEL",
-  "reasoning": "<one sentence explaining the MAPPED vs NOVEL decision>"
 }}"""
 # ---------------------------------------------------------------------------
-# Validation Method 2 — Regex / Pattern-based grounding check (non-LLM)
 # ---------------------------------------------------------------------------
-def validate_label_with_regex(label: str, keywords: list[str]) -> dict:
-    """
-    Checks if the AI-generated label is grounded in the cluster's actual keywords.
-    Returns a dict with overlap score, matched terms, and a PASS/FAIL verdict.
-    This method uses only Python re — no AI involved.
-    """
-    if not label or not keywords:
-        return {"verdict": "FAIL", "overlap_score": 0, "matched_terms": [], "reason": "Empty label or keywords"}
-    # Normalise: lowercase, split on word boundaries
-    label_tokens = set(re.findall(r'\b[a-z]{3,}\b', label.lower()))
-    kw_tokens = set(re.findall(r'\b[a-z]{3,}\b', " ".join(keywords).lower()))
-    # Remove common stop words that add noise
-    noise = {"the", "and", "for", "with", "using", "based", "from", "into", "this", "that", "are"}
-    label_tokens -= noise
-    kw_tokens -= noise
-    matched = list(label_tokens & kw_tokens)
-    overlap_score = len(matched) / max(len(label_tokens), 1)
-    # Stem-level match: check if any label token is a prefix (>=4 chars) of a keyword or vice versa
-    stem_matches = []
-    for lt in label_tokens:
-        for kt in kw_tokens:
             if len(lt) >= 4 and (kt.startswith(lt[:4]) or lt.startswith(kt[:4])):
-                stem_matches.append(f"{lt}≈{kt}")
-    total_score = min(1.0, overlap_score + 0.15 * len(stem_matches))
-    verdict = "PASS" if (len(matched) >= 1 or len(stem_matches) >= 1) else "FAIL"
-    return {
-        "verdict": verdict,
-        "overlap_score": round(total_score, 3),
-        "matched_terms": matched,
-        "stem_matches": stem_matches[:5],
-        "label_tokens": list(label_tokens),
-        "reason": f"{len(matched)} exact + {len(stem_matches)} stem matches against {len(kw_tokens)} keyword tokens",
-    }
 # ---------------------------------------------------------------------------
-# Core — Topic Interpretation with 3-LLM Council + dual validation
 # ---------------------------------------------------------------------------
-def interpret_topic(
-    topic_id, keywords, samples, groq_client, mistral_key, gemini_key,
-    paper_count, representative_docs
-) -> TopicInterpretation:
-    prompt = _build_interpretation_prompt(keywords, samples, DEFAULT_TAXONOMY_CATEGORIES)
-    # ------------------------------------------------------------------
-    # Step A: Deterministic non-LLM NOVEL pre-check
-    # If keywords/samples match known NOVEL patterns, override to NOVEL
-    # regardless of what the LLMs say. This is the non-LLM validation
-    # method — uses only regex, no AI.
-    # ------------------------------------------------------------------
-    forced_novel = _is_deterministic_novel(keywords, samples)
-    if forced_novel:
-        logger.info(f"Topic {topic_id}: NOVEL forced by regex trigger on keywords={keywords[:4]}")
-    # ------------------------------------------------------------------
-    # Step B: 3-LLM Council
-    # Call Groq (LLaMA-3.1), Mistral Small, and Gemini 2.5 Flash
-    # independently. Three different providers = three independent votes.
-    # ------------------------------------------------------------------
-    raw_results = []
-    groq_res = _call_llm_json(groq_client, prompt, DEFAULT_MODEL)
-    raw_results.append({"llm": "Groq/LLaMA-3.1", "response": groq_res})
     time.sleep(1)
-    mistral_res = call_mistral_label(prompt, mistral_key)
-    raw_results.append({"llm": "Mistral-Small", "response": mistral_res})
     time.sleep(1)
-    if gemini_key:
-        gemini_res = call_gemini_label(prompt, gemini_key)
-        raw_results.append({"llm": "Gemini-2.5-Flash", "response": gemini_res})
-    results = [r["response"] for r in raw_results]
-    # ------------------------------------------------------------------
-    # Step C: Select best label via majority vote on label text
-    # ------------------------------------------------------------------
-    best = select_best_interpretation(results, keywords)
-    if not best:
-        l, c = _fallback_label_from_keywords(keywords, topic_id)
-        best = {"label": l, "taxonomy_category": c, "classification": "MAPPED"}
-    final_label = _safe_capitalize(best.get("label"))
-    # ------------------------------------------------------------------
-    # Step D: Classification majority vote — separate from label vote
-    # Count NOVEL vs MAPPED votes across all 3 LLMs.
-    # NOVEL wins if: (a) forced by regex OR (b) at least 1 LLM votes NOVEL.
-    # Conservative toward NOVEL because PAJAIS 2019 is outdated and TMIS
-    # publishes many post-2018 techniques with no PAJAIS home.
-    # ------------------------------------------------------------------
-    classification_votes = []
-    for r in results:
-        if r and "classification" in r:
-            v = str(r["classification"]).upper().strip()
-            if v in ("MAPPED", "NOVEL"):
-                classification_votes.append(v)
-    novel_votes = classification_votes.count("NOVEL")
-    mapped_votes = classification_votes.count("MAPPED")
-    # Classification decision logic:
-    # - Regex forced (unambiguous compound NOVEL term in keywords/samples) → always NOVEL
-    # - LLM majority (2 or more of 3 LLMs vote NOVEL) → NOVEL
-    # - Single LLM vote for NOVEL + 2 for MAPPED → MAPPED (majority wins)
-    # - All 3 vote MAPPED → MAPPED
-    # This gives ~40-60% NOVEL as expected for TMIS vs PAJAIS 2019 comparison.
-    if forced_novel or novel_votes >= 2:
-        final_classification = "NOVEL"
-    else:
-        final_classification = "MAPPED"
-    logger.info(
-        f"Topic {topic_id} classification: NOVEL_votes={novel_votes}, "
-        f"MAPPED_votes={mapped_votes}, regex_forced={forced_novel} → {final_classification}"
-    )
-    # ------------------------------------------------------------------
-    # Step E: Build council vote evidence for UI display
-    # Each LLM's label, category, classification, and reasoning is stored
-    # so the UI can show per-topic agreement/disagreement transparently.
-    # ------------------------------------------------------------------
-    council_votes = []
-    for r in raw_results:
-        resp = r["response"]
-        council_votes.append({
-            "llm": r["llm"],
-            "label": clean_label(resp.get("label", "—")) if resp else "—",
-            "category": resp.get("taxonomy_category", "—") if resp else "—",
-            "classification": resp.get("classification", "—") if resp else "—",
-            "reasoning": resp.get("reasoning", "—") if resp else "—",
-        })
-    # ------------------------------------------------------------------
-    # Step F: Regex grounding check on the final label
-    # Verifies the label tokens are grounded in actual cluster keywords.
-    # Catches hallucinated labels (confident-sounding but disconnected
-    # from the underlying data). Pure regex — no AI involved.
-    # ------------------------------------------------------------------
-    regex_validation = validate_label_with_regex(final_label, keywords)
-    logger.info(
-        f"Topic {topic_id} label grounding: {regex_validation['verdict']} "
-        f"(score={regex_validation['overlap_score']}, matched={regex_validation['matched_terms']})"
     )
-    # ------------------------------------------------------------------
-    # Build the final TopicInterpretation object
-    # ------------------------------------------------------------------
-    interp = TopicInterpretation(
-        topic_id=topic_id,
-        label=final_label,
-        category=_safe_capitalize(best.get("taxonomy_category")),
-        classification=final_classification,
-        paper_count=paper_count,
-        keywords=keywords,
-    )
-    # Attach validation evidence as dynamic attributes (serialised manually in run_agent)
-    interp.council_votes         = council_votes
-    interp.regex_validation      = regex_validation
-    interp.novel_forced_by_regex = forced_novel
-    interp.classification_votes  = {"NOVEL": novel_votes, "MAPPED": mapped_votes}
-    return interp
 # ---------------------------------------------------------------------------
-# Run Agent — orchestrates all topics and writes outputs
 # ---------------------------------------------------------------------------
 def run_agent(
-    topic_results,
-    groq_key,
-    mistral_key,
-    gemini_key,
-    output_json="topics.json",
-    output_csv="topics.csv",
 ) -> dict:
     client = build_groq_client(groq_key)
-    res = topic_results["documents"]
-    num_clusters = len([t for t in set(res["topics"]) if t != -1])
-    num_topics   = len(res["topic_keywords"])
-    print(f"Final cluster count: {num_clusters}")
-    print(f"Final topic count:   {num_topics}")
-    if num_clusters != num_topics:
-        logger.error(f"CONSISTENCY WARNING: {num_clusters} clusters != {num_topics} topics")
     interpretations = {}
-    for i, (tid, kw_pairs) in enumerate(res["topic_keywords"].items()):
-        interp = interpret_topic(
-            tid,
-            [w for w, _ in kw_pairs],
-            res["representative_docs"].get(tid, []),
-            client,
-            mistral_key,
-            gemini_key,
-            res["topic_freq"].get(tid, 0),
-            res["representative_docs"].get(tid, []),
         )
-        interpretations[tid] = interp
-        logger.info(f"Interpreted {tid}: {interp.label} [{interp.classification}]")
-    # Build serialisable list — include all validation evidence
-    interp_list = []
-    for i in interpretations.values():
-        d = asdict(i)
-        # asdict() only captures @dataclass fields; add dynamic attributes manually
-        d["council_votes"]         = getattr(i, "council_votes", [])
-        d["regex_validation"]      = getattr(i, "regex_validation", {})
-        d["novel_forced_by_regex"] = getattr(i, "novel_forced_by_regex", False)
-        d["classification_votes"]  = getattr(i, "classification_votes", {})
-        interp_list.append(d)
-    clean_data = convert_numpy_types(interp_list)
     with open(output_json, "w") as f:
-        json.dump(clean_data, f, indent=2)
-    df = pd.DataFrame(clean_data)
     if not df.empty:
-        df["keywords"] = df["keywords"].apply(
-            lambda x: ", ".join(x) if isinstance(x, list) else str(x)
-        )
         df.to_csv(output_csv, index=False)
-    return {
-        "interpretations": interpretations,
-        "json_path": output_json,
-        "csv_path": output_csv,
-    }
-if __name__ == "__main__":
-    pass

 """
 agent.py
 --------
+LLM Council labelling module (§3.5).
+Three independent LLMs label each cluster, producing Sheets 1–3.
+Sheet 4 consolidates with Triple / Two / Single agreement tags.
+Disagreement clusters get a fourth-round defence prompt.
+Labels not grounded in keyphrases are rejected.
 """
 from __future__ import annotations
 import json
 import logging
 import os
+import re
 import time
+from dataclasses import dataclass, field, asdict
 from typing import Optional
 import pandas as pd
+import numpy as np
 import requests
 from groq import Groq
 # ---------------------------------------------------------------------------
 # ---------------------------------------------------------------------------
 # Constants
 # ---------------------------------------------------------------------------
# Model identifiers passed as the "model" field of the Groq and Mistral
# chat-completion requests.
GROQ_MODEL = "llama-3.1-8b-instant"
MISTRAL_MODEL = "mistral-small-latest"

# Broad subject-area names; "Other" is the catch-all entry.
DEFAULT_TAXONOMY = [
    "Artificial Intelligence", "Machine Learning",
    "Natural Language Processing", "Computer Vision",
    "Information Systems", "Healthcare & Bioinformatics",
    "Finance & Economics", "Cybersecurity",
    "Human-Computer Interaction", "Robotics & Automation",
    "Education Technology", "Environmental Science",
    "Social Sciences", "Data Engineering", "Other",
]
 # ---------------------------------------------------------------------------
+# Data classes
 # ---------------------------------------------------------------------------
@dataclass
class LLMVote:
    """One LLM's response for one cluster.

    The ``label``, ``description``, ``pacis_match`` and ``confidence``
    fields carry the same names as the JSON keys the labelling prompt asks
    each model to return.
    """

    # Identifier of the model that produced this vote (required).
    llm_name: str
    # Proposed topic label; empty string when none was produced.
    label: str = ""
    # One-sentence topic description.
    description: str = ""
    # Closest PAJAIS 2019 category, or "NOVEL".
    pacis_match: str = ""
    # Model-reported confidence; the prompt requests a float in 0.0-1.0.
    confidence: float = 0.0
    # Presumably the full parsed response dict -- TODO confirm against caller.
    # default_factory gives every instance its own dict.
    raw: dict = field(default_factory=dict)
 @dataclass
+class ClusterInterpretation:
+    """Consolidated interpretation for a single cluster."""
+    cluster_id: int
+    final_label: str = ""
+    final_description: str = ""
+    final_pacis_match: str = ""
+    final_confidence: float = 0.0
+    agreement: str = ""          # Triple / Two / Single
+    sheet1: dict = field(default_factory=dict)
+    sheet2: dict = field(default_factory=dict)
+    sheet3: dict = field(default_factory=dict)
+    defence: dict = field(default_factory=dict)  # 4th-round if needed
+    keyphrases: list = field(default_factory=list)
+    strong_count: int = 0
+    weak_count: int = 0
     paper_count: int = 0
+    grounding_check: dict = field(default_factory=dict)
 # ---------------------------------------------------------------------------
+# API Clients
 # ---------------------------------------------------------------------------
 def build_groq_client(api_key: Optional[str] = None):
     key = api_key or os.getenv("GROQ_API_KEY")
     if not key:
+        raise ValueError("No Groq API key.")
     return Groq(api_key=key, max_retries=0)
def _call_groq(client, prompt: str) -> dict:
    """Send *prompt* to Groq as a single user message and parse the reply.

    Args:
        client: object exposing ``chat.completions.create`` (the Groq client).
        prompt: full prompt text.

    Returns:
        The result of ``_parse_json`` on the first choice's message content,
        or ``{}`` if anything raises.
    """
    try:
        response = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            timeout=15,
        )
        return _parse_json(response.choices[0].message.content)
    except Exception as exc:
        # Deliberately best-effort: one failing council member must not
        # abort the whole labelling run, so log and report "no answer".
        logger.warning("Groq failed: %s", exc)
        return {}
+def _call_mistral(prompt: str, api_key: str) -> dict:
+    if not api_key:
+        return {}
     try:
+        r = requests.post(
             "https://api.mistral.ai/v1/chat/completions",
+            headers={"Authorization": f"Bearer {api_key}",
+                     "Content-Type": "application/json"},
+            json={"model": MISTRAL_MODEL,
+                  "messages": [{"role": "user", "content": prompt}],
+                  "temperature": 0.2},
+            timeout=15,
         )
+        return _parse_json(r.json()["choices"][0]["message"]["content"])
     except Exception as e:
+        logger.warning("Mistral failed: %s", e)
         return {}
+def _call_gemini(prompt: str, api_key: str) -> dict:
+    if not api_key:
+        return {}
+    url = (f"https://generativelanguage.googleapis.com/v1beta/models/"
+           f"gemini-2.5-flash:generateContent?key={api_key}")
     try:
+        r = requests.post(url,
+                          headers={"Content-Type": "application/json"},
+                          json={"contents": [{"parts": [{"text": prompt}]}],
+                                "generationConfig": {"temperature": 0.2}},
+                          timeout=15)
+        data = r.json()
+        if "candidates" not in data:
+            return {}
+        raw = data["candidates"][0]["content"]["parts"][0]["text"]
+        return _parse_json(raw)
     except Exception as e:
+        logger.warning("Gemini failed: %s", e)
         return {}
+def _parse_json(raw: str) -> dict:
+    raw = raw.strip().replace("```json", "").replace("```", "").strip()
+    s, e = raw.find("{"), raw.rfind("}") + 1
+    if s != -1 and e > 0:
+        raw = raw[s:e]
+    try:
+        return json.loads(raw)
+    except Exception:
+        return {}
 # ---------------------------------------------------------------------------
+# Prompt builders
 # ---------------------------------------------------------------------------
+def _build_label_prompt(keyphrases: list, rep_abstracts: list) -> str:
+    kp_str = ", ".join(k if isinstance(k, str) else k[0]
+                       for k in keyphrases[:5])
+    abs_str = " | ".join(a[:300] for a in rep_abstracts[:3])
+    return f"""You are a research-topic classifier.
+A SPECTER-2 + HDBSCAN pipeline produced a topic cluster.
+KEYPHRASES: {kp_str}
+REPRESENTATIVE ABSTRACTS (truncated): {abs_str}
+Return ONLY valid JSON (no markdown, no other text):
+{{
+  "label": "<concise 5-8 word topic label>",
+  "description": "<one-sentence description of the topic>",
+  "pacis_match": "<closest PAJAIS 2019 category, or NOVEL if none>",
+  "confidence": <0.0-1.0 float>
+}}"""
+def _build_defence_prompt(
+    keyphrases: list,
+    rep_abstracts: list,
+    votes: list[dict],
+) -> str:
+    kp_str = ", ".join(k if isinstance(k, str) else k[0]
+                       for k in keyphrases[:5])
+    abs_str = " | ".join(a[:300] for a in rep_abstracts[:3])
+    v_str = "\n".join(
+        f"  LLM {i+1}: label=\"{v.get('label','?')}\", "
+        f"pacis=\"{v.get('pacis_match','?')}\""
+        for i, v in enumerate(votes)
+    )
+    return f"""You are a research-topic adjudicator resolving a labelling disagreement.
+KEYPHRASES: {kp_str}
+REPRESENTATIVE ABSTRACTS: {abs_str}
+Three LLMs proposed different labels:
+{v_str}
+Your task: pick the single best label from the three, or synthesise a
+better one.  Justify your choice in one sentence.
+Return ONLY valid JSON:
 {{
+  "label": "<best 5-8 word label>",
+  "description": "<one sentence>",
+  "pacis_match": "<PAJAIS category or NOVEL>",
+  "confidence": <0.0-1.0>,
+  "reasoning": "<one sentence justification>"
 }}"""
 # ---------------------------------------------------------------------------
+# Grounding check — reject labels not supported by keyphrases (§3.5)
 # ---------------------------------------------------------------------------
def grounding_check(label: str, keyphrases: list) -> dict:
    """Non-LLM regex check: label tokens must overlap keyphrases.

    Tokens are lowercase ASCII words of three or more letters, minus a small
    stop-word list. A label passes when at least one of its tokens equals a
    keyphrase token, or shares a four-character prefix with one.

    Args:
        label: candidate topic label.
        keyphrases: items are either plain strings or sequences whose first
            element is the phrase.

    Returns:
        ``{"verdict": "FAIL", "score": 0, "matched": []}`` for an empty
        label or keyphrase list; otherwise a dict with ``verdict``
        ("PASS"/"FAIL"), ``score`` (0-1, rounded to 3 places), ``matched``
        (exact token matches) and ``stems`` (up to five "label≈keyphrase"
        prefix matches). Both lists are sorted so the result does not depend
        on set iteration order.
    """
    if not label or not keyphrases:
        return {"verdict": "FAIL", "score": 0, "matched": []}

    token_pattern = r"\b[a-z]{3,}\b"
    noise = {"the", "and", "for", "with", "using", "based", "from", "that",
             "are", "this", "into", "its"}

    label_toks = set(re.findall(token_pattern, label.lower())) - noise
    kp_toks = set()
    for kp in keyphrases:
        phrase = kp if isinstance(kp, str) else kp[0]
        kp_toks.update(re.findall(token_pattern, str(phrase).lower()))
    kp_toks -= noise

    matched = sorted(label_toks & kp_toks)

    # Stem-level match: a label token of at least four letters whose first
    # four letters prefix a keyphrase token, or vice versa.
    stems = []
    for lt in label_toks:
        if len(lt) < 4:
            continue
        for kt in kp_toks:
            if kt.startswith(lt[:4]) or lt.startswith(kt[:4]):
                stems.append(f"{lt}≈{kt}")
    stems.sort()

    score = min(1.0, len(matched) / max(len(label_toks), 1)
                + 0.15 * len(stems))
    verdict = "PASS" if (matched or stems) else "FAIL"
    return {"verdict": verdict, "score": round(score, 3),
            "matched": matched, "stems": stems[:5]}
 # ---------------------------------------------------------------------------
+# Core — interpret one cluster via 3-LLM council (§3.5)
 # ---------------------------------------------------------------------------
def interpret_cluster(
    cluster_id: int,
    keyphrases: list,
    rep_docs: list,
    strong: int,
    weak: int,
    groq_client,
    mistral_key: str,
    gemini_key: str,
) -> ClusterInterpretation:
    """Label one cluster by polling three LLMs and consolidating the votes.

    The same labelling prompt is sent to Groq, Mistral and Gemini in turn,
    with a one-second pause between calls.  Votes are compared on their
    cleaned, lower-cased label:

    * three identical labels give agreement ``"Triple"``;
    * two identical labels give agreement ``"Two"``;
    * otherwise agreement is ``"Single"`` and a fourth "defence" prompt is
      sent to Groq; its label is used when present, else the first usable
      vote.

    The chosen label is then run through ``grounding_check``.  On FAIL it is
    replaced by the vote whose label shares the most tokens with the
    keyphrases.

    Args:
        cluster_id: Identifier of the cluster, echoed into the result.
        keyphrases: Cluster keyphrases; entries are strings or indexables
            whose first element is the phrase.
        rep_docs: Representative documents passed to the prompt builders.
        strong: Number of strong members of the cluster.
        weak: Number of weak members of the cluster.
        groq_client: Client handed to ``_call_groq``.
        mistral_key: API key handed to ``_call_mistral``.
        gemini_key: API key handed to ``_call_gemini``.

    Returns:
        A ``ClusterInterpretation`` holding the final label, the metadata of
        the response that produced it, the three raw votes, the defence
        response (``{}`` when none was requested or it was unusable), the
        first five keyphrases and the grounding result.
    """
    prompt = _build_label_prompt(keyphrases, rep_docs)

    # Sheet 1 — Groq / LLaMA-3.1
    s1 = _call_groq(groq_client, prompt)
    time.sleep(1)
    # Sheet 2 — Mistral
    s2 = _call_mistral(prompt, mistral_key)
    time.sleep(1)
    # Sheet 3 — Gemini
    s3 = _call_gemini(prompt, gemini_key)
    votes = [s1, s2, s3]

    # Only dict responses carrying a non-empty label can vote; otherwise two
    # blank answers would count as a "consensus" on the empty string.
    valid = [v for v in votes
             if isinstance(v, dict) and _clean(v.get("label"))]

    # --- Sheet 4: consolidate agreement ---
    counts = {}
    for vote in valid:
        key = _clean(vote["label"]).lower()
        counts[key] = counts.get(key, 0) + 1
    top = max(counts.values(), default=0)

    best_label = ""
    defence = {}
    if top >= 2:
        agreement = "Triple" if top >= 3 else "Two"
        winner = max(counts, key=counts.get)
        best_label = next(v["label"] for v in valid
                          if _clean(v["label"]).lower() == winner)
    else:
        agreement = "Single"
        # Fourth-round defence prompt (§3.5)
        defence_prompt = _build_defence_prompt(keyphrases, rep_docs, votes)
        reply = _call_groq(groq_client, defence_prompt)
        # Keep the stored defence a dict even when the call yields nothing.
        defence = reply if isinstance(reply, dict) else {}
        if _clean(defence.get("label")):
            best_label = defence["label"]
        elif valid:
            best_label = valid[0]["label"]
    best_label = _clean(best_label)

    # Grounding check — reject if not supported by keyphrases
    gc = grounding_check(best_label, keyphrases)
    if gc["verdict"] == "FAIL" and valid:
        # Fall back to the most keyphrase-grounded label.  The keyphrase
        # tokens are the same for every vote, so build them once.
        token_pattern = r"\b[a-z]{3,}\b"
        kp_text = " ".join(str(k if isinstance(k, str) else k[0])
                           for k in keyphrases)
        kp_toks = set(re.findall(token_pattern, kp_text.lower()))

        def _overlap(vote):
            toks = re.findall(token_pattern, str(vote["label"]).lower())
            return len(set(toks) & kp_toks)

        # max() keeps the earliest vote on ties, like a stable sort would.
        best_label = _clean(max(valid, key=_overlap)["label"])
        gc = grounding_check(best_label, keyphrases)
        logger.info("Cluster %d: label rejected by grounding, "
                     "fell back to '%s'", cluster_id, best_label)

    # Best metadata: take it from the response that actually carries the
    # final label.  The defence response is searched after the votes so a
    # defence-chosen label is not described by an unrelated vote's fields.
    wanted = best_label.lower()
    best_v = next((v for v in [*valid, defence]
                   if _clean(v.get("label", "")).lower() == wanted),
                  valid[0] if valid else {})

    return ClusterInterpretation(
        cluster_id=cluster_id,
        final_label=best_label,
        final_description=best_v.get("description", ""),
        final_pacis_match=best_v.get("pacis_match", ""),
        final_confidence=best_v.get("confidence", 0.0),
        agreement=agreement,
        sheet1=s1, sheet2=s2, sheet3=s3,
        defence=defence,
        keyphrases=[k if isinstance(k, str) else k[0]
                    for k in keyphrases[:5]],
        strong_count=strong,
        weak_count=weak,
        paper_count=strong + weak,
        grounding_check=gc,
    )
def _clean(s: str, max_len: int = 60) -> str:
    """Normalise a label: collapse whitespace, cap its length, trim the end.

    Args:
        s: Raw label text.  Falsy values (``None``, ``""``) yield ``""``;
            other non-string values are converted with ``str``.
        max_len: Maximum number of characters kept before trailing
            spaces and periods are stripped.

    Returns:
        The text on a single line with runs of whitespace reduced to one
        space.  Text longer than ``max_len`` is cut back to the last whole
        word that fits (or hard-cut when it contains no space), and trailing
        spaces and periods are removed.
    """
    # split()/join collapses every whitespace run, newlines included.
    text = " ".join(str(s or "").split())
    if len(text) > max_len:
        head = text[:max_len]
        # Only back up to the previous space when the cut lands inside a
        # word; if the next character is a space the last word is complete.
        if text[max_len] != " " and " " in head:
            head = head.rsplit(" ", 1)[0]
        text = head
    return text.rstrip(" .")
 # ---------------------------------------------------------------------------
+# Numpy-safe serialisation
+# ---------------------------------------------------------------------------
def _convert(obj):
    """Recursively replace numpy values with plain Python equivalents.

    Walks dicts (keys and values), lists and tuples.  numpy arrays become
    nested lists, numpy booleans become ``bool``, numpy integers become
    ``int`` and numpy floats become ``float``.  Anything else is returned
    unchanged.

    Args:
        obj: Arbitrary (possibly nested) value.

    Returns:
        A structure of the same shape containing no numpy scalars or arrays
        in the positions listed above, so it can be handed to ``json.dump``.
    """
    if isinstance(obj, dict):
        # Keys are converted too: json rejects numpy integers as keys.
        return {_convert(k): _convert(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_convert(v) for v in obj]
    if isinstance(obj, tuple):
        return tuple(_convert(v) for v in obj)
    if isinstance(obj, np.ndarray):
        # tolist() already yields Python scalars for numeric dtypes; the
        # extra pass covers object arrays that still hold numpy values.
        return _convert(obj.tolist())
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj
+# ---------------------------------------------------------------------------
+# Run agent — orchestrate all clusters
 # ---------------------------------------------------------------------------
 def run_agent(
+    topic_results: dict,
+    groq_key: str,
+    mistral_key: str,
+    gemini_key: str,
+    output_json: str = "topics.json",
+    output_csv: str = "topics.csv",
 ) -> dict:
     client = build_groq_client(groq_key)
+    labels_list   = topic_results["labels"]
+    keyphrases    = topic_results["keyphrases"]
+    rep_docs      = topic_results["representative_docs"]
+    membership    = topic_results["membership"]
+    cluster_ids = sorted(keyphrases.keys())
     interpretations = {}
+    for cid in cluster_ids:
+        sw = membership.get(cid, {"strong": 0, "weak": 0})
+        interp = interpret_cluster(
+            cluster_id=cid,
+            keyphrases=keyphrases.get(cid, []),
+            rep_docs=rep_docs.get(cid, []),
+            strong=sw["strong"],
+            weak=sw["weak"],
+            groq_client=client,
+            mistral_key=mistral_key,
+            gemini_key=gemini_key,
         )
+        interpretations[cid] = interp
+        logger.info("Cluster %d → %s [%s] (%d strong, %d weak)",
+                    cid, interp.final_label, interp.agreement,
+                    interp.strong_count, interp.weak_count)
+    # Serialise
+    records = [_convert(asdict(i)) for i in interpretations.values()]
     with open(output_json, "w") as f:
+        json.dump(records, f, indent=2)
+    df = pd.DataFrame(records)
     if not df.empty:
+        for col in ["sheet1", "sheet2", "sheet3", "defence",
+                     "keyphrases", "grounding_check"]:
+            if col in df.columns:
+                df[col] = df[col].apply(str)
         df.to_csv(output_csv, index=False)
+    return dict(interpretations=interpretations,
+                json_path=output_json, csv_path=output_csv)