BERTopic_AG_final

Running

App Files Files Community

BHAVIKBANKER commited on 13 days ago

Commit

64049b0

verified ·

1 Parent(s): 853e1a5

Update agent.py

Browse files

Files changed (1) hide show

agent.py +611 -158

agent.py CHANGED Viewed

@@ -1,7 +1,12 @@
 """
 agent.py — LangGraph-based topic analysis agent (§11).
-3-LLM Council for topic modelling + methodology extraction.
-Methodology pipeline: Regex pre-scan → Groq → Mistral → Gemini → consolidation.
 """
 from __future__ import annotations
 import json, logging, os, re, time
@@ -17,9 +22,9 @@ logger = logging.getLogger(__name__)
 GROQ_MODEL    = "llama-3.1-8b-instant"
 MISTRAL_MODEL = "mistral-small-latest"
-# ---------------------------------------------------------------------------
-# Regex pattern banks — transparent extraction shown in UI
-# ---------------------------------------------------------------------------
 METHODOLOGY_PATTERNS = {
     "Survey / Systematic Review":  re.compile(
         r"\b(survey|systematic\s+review|literature\s+review|bibliometric|scoping\s+review|meta.?analysis)\b", re.I),
@@ -66,6 +71,12 @@ TECHNIQUE_PATTERNS = {
         r"\b(reinforcement\s+learning|Q.learning|policy\s+gradient|reward\s+function|Markov\s+decision)\b", re.I),
     "Cloud / Big Data":            re.compile(
         r"\b(cloud\s+computing|Hadoop|Spark|MapReduce|big\s+data|distributed\s+computing|edge\s+computing)\b", re.I),
 }
 ORIENTATION_PATTERNS = {
@@ -74,13 +85,30 @@ ORIENTATION_PATTERNS = {
     "mixed":       re.compile(r"\b(mixed\s+method|qualitative.+quantitative|both|triangulat)\b", re.I),
 }
 def _regex_scan(docs: list[str]) -> dict:
-    """
-    Run all pattern banks against a list of documents.
-    Returns hits per category with matched spans so the UI can show exactly
-    which text triggered each label.
-    """
     method_hits    = defaultdict(list)
     technique_hits = defaultdict(list)
     orientation_counts = {"empirical": 0, "theoretical": 0, "mixed": 0}
@@ -88,18 +116,19 @@ def _regex_scan(docs: list[str]) -> dict:
     for doc_idx, doc in enumerate(docs):
         for label, pat in METHODOLOGY_PATTERNS.items():
             for m in pat.finditer(doc):
-                method_hits[label].append({"doc": doc_idx + 1, "match": m.group(0),
-                                           "span": [m.start(), m.end()]})
         for label, pat in TECHNIQUE_PATTERNS.items():
             for m in pat.finditer(doc):
-                technique_hits[label].append({"doc": doc_idx + 1, "match": m.group(0),
-                                              "span": [m.start(), m.end()]})
         for orient, pat in ORIENTATION_PATTERNS.items():
             if pat.search(doc):
                 orientation_counts[orient] += 1
     total_orient = sum(orientation_counts.values()) or 1
     return {
         "methods":    {k: v for k, v in method_hits.items()    if v},
         "techniques": {k: v for k, v in technique_hits.items() if v},
@@ -111,33 +140,42 @@ def _regex_scan(docs: list[str]) -> dict:
         "patterns_applied": {
             "methodology": list(METHODOLOGY_PATTERNS.keys()),
             "technique":   list(TECHNIQUE_PATTERNS.keys()),
-            "orientation": list(ORIENTATION_PATTERNS.keys()),
         },
     }
 def _regex_summary(scan: dict) -> str:
-    """Human-readable summary of regex hits — injected into LLM prompt as evidence."""
     lines = []
     if scan["methods"]:
         lines.append("REGEX-DETECTED METHODOLOGIES:")
         for k, hits in scan["methods"].items():
-            unique_matches = list(dict.fromkeys(h["match"] for h in hits))[:3]
             papers = sorted({h["doc"] for h in hits})
-            lines.append(f"  • {k} — matched: {unique_matches} (papers: {papers})")
     if scan["techniques"]:
         lines.append("REGEX-DETECTED TECHNIQUES:")
         for k, hits in scan["techniques"].items():
-            unique_matches = list(dict.fromkeys(h["match"] for h in hits))[:3]
             papers = sorted({h["doc"] for h in hits})
-            lines.append(f"  • {k} — matched: {unique_matches} (papers: {papers})")
-    return "\n".join(lines) or "No regex hits found — rely on abstracts alone."
-# ---------------------------------------------------------------------------
-# LangGraph state
-# ---------------------------------------------------------------------------
 class PipelineState(TypedDict, total=False):
     filepath: str
     groq_key: str
     mistral_key: str
@@ -154,11 +192,18 @@ class PipelineState(TypedDict, total=False):
     refinement_log: list
     json_path: str
     error: str
-# ---------------------------------------------------------------------------
-# API helpers
-# ---------------------------------------------------------------------------
 def _parse(raw: str) -> dict:
     raw = raw.strip().replace("```json","").replace("```","").strip()
     s, e = raw.find("{"), raw.rfind("}")+1
@@ -169,7 +214,7 @@ def _parse(raw: str) -> dict:
 def _groq(client, prompt):
     try:
         r = client.chat.completions.create(model=GROQ_MODEL,
-            messages=[{"role":"user","content":prompt}], temperature=0.2, timeout=15)
         return _parse(r.choices[0].message.content)
     except Exception as e: logger.warning("Groq: %s", e); return {}
@@ -179,7 +224,7 @@ def _mistral(prompt, key):
         r = requests.post("https://api.mistral.ai/v1/chat/completions",
             headers={"Authorization":f"Bearer {key}","Content-Type":"application/json"},
             json={"model":MISTRAL_MODEL,"messages":[{"role":"user","content":prompt}],
-                  "temperature":0.2}, timeout=15)
         return _parse(r.json()["choices"][0]["message"]["content"])
     except Exception as e: logger.warning("Mistral: %s", e); return {}
@@ -200,7 +245,8 @@ def _gemini(prompt, key):
                 msg = err.get("message","") if isinstance(err,dict) else str(err)
                 if "quota" in msg.lower() or "rate" in msg.lower():
                     wait = min(40, 10*(attempt+1))
-                    logger.warning("Gemini rate-limited, waiting %ds…", wait); time.sleep(wait); continue
                 logger.warning("Gemini attempt %d: %s", attempt+1, msg); return {}
             return _parse(d["candidates"][0]["content"]["parts"][0]["text"])
         except Exception as e:
@@ -208,9 +254,9 @@ def _gemini(prompt, key):
     return {}
-# ---------------------------------------------------------------------------
-# Topic labelling prompts
-# ---------------------------------------------------------------------------
 def _label_prompt(keyphrases, rep_docs):
     kp = ", ".join(k[0] if isinstance(k,tuple) else k for k in keyphrases[:5])
     ab = " | ".join(a[:250] for a in rep_docs[:3])
@@ -235,10 +281,6 @@ Votes:\n{v_str}
 Pick the best label or synthesise a better one.
 Return ONLY JSON: {{"label":"...","description":"...","pacis_match":"...","confidence":0.0}}"""
-# ---------------------------------------------------------------------------
-# Methodology prompt — seeded with regex evidence
-# ---------------------------------------------------------------------------
 def _methodology_prompt(label: str, rep_docs: list[str], regex_summary: str) -> str:
     ab = "\n\n".join(f"Paper {i+1}: {d[:500]}" for i,d in enumerate(rep_docs[:3]))
     return f"""You are a research methodology auditor for the cluster: "{label}".
@@ -274,42 +316,131 @@ Return ONLY valid JSON:
   "regex_rejected":  ["<label2>"]
 }}"""
-# ---------------------------------------------------------------------------
-# Consolidate 3-LLM methodology responses
-# ---------------------------------------------------------------------------
-def _consolidate_methodology(r1: dict, r2: dict, r3: dict, regex_scan: dict) -> dict:
     """
-    Merge Groq + Mistral + Gemini methodology responses.
-    Rule: a method/technique survives only when ≥2 LLMs named it.
-    Percentage = average across agreeing LLMs.
     """
     def _name_map(r, key):
         return {item["name"].strip().lower(): item for item in r.get(key, [])}
     def _merge_items(key):
-        maps    = [_name_map(r, key) for r in [r1, r2, r3]]
         all_keys = set().union(*[m.keys() for m in maps])
         accepted, rejected = [], []
         for k in all_keys:
-            voters   = [m[k] for m in maps if k in m]
-            n_votes  = len(voters)
-            avg_pct  = round(sum(v.get("pct",0) for v in voters) / n_votes)
-            papers   = sorted({p for v in voters for p in v.get("papers",[])})
-            evidence = next((v.get("evidence","") for v in voters if v.get("evidence")), "")
-            row = {
-                "name":      voters[0]["name"],
-                "pct":       avg_pct,
-                "papers":    papers,
-                "evidence":  evidence,
-                "llm_votes": n_votes,
-                "agreement": "Triple" if n_votes==3 else "Two" if n_votes==2 else "Single",
-            }
             (accepted if n_votes >= 2 else rejected).append(row)
         return (sorted(accepted, key=lambda x: -x["pct"]),
                 sorted(rejected, key=lambda x: -x["pct"]))
-    methods_acc,   methods_rej   = _merge_items("methodologies")
     techniques_acc, techniques_rej = _merge_items("techniques")
     emp_avg  = round(sum(r.get("empirical_pct",  0) for r in [r1,r2,r3]) / 3)
@@ -318,20 +449,15 @@ def _consolidate_methodology(r1: dict, r2: dict, r3: dict, regex_scan: dict) ->
     confirmed_votes = Counter(item for r in [r1,r2,r3] for item in r.get("regex_confirmed",[]))
     rejected_votes  = Counter(item for r in [r1,r2,r3] for item in r.get("regex_rejected",[]))
     dom_m = Counter(r.get("dominant_method","")    for r in [r1,r2,r3] if r).most_common(1)
     dom_t = Counter(r.get("dominant_technique","") for r in [r1,r2,r3] if r).most_common(1)
     return {
-        "methodologies":      methods_acc,
-        "techniques":         techniques_acc,
-        "rejected_methods":   methods_rej,
-        "rejected_techniques":techniques_rej,
         "dominant_method":    dom_m[0][0] if dom_m else "—",
         "dominant_technique": dom_t[0][0] if dom_t else "—",
-        "empirical_pct":      emp_avg,
-        "theoretical_pct":    theo_avg,
-        "mixed_pct":          mix_avg,
         "regex_confirmed_consensus": [k for k,v in confirmed_votes.items() if v>=2],
         "regex_rejected_consensus":  [k for k,v in rejected_votes.items()  if v>=2],
         "llm_raw": {"groq": r1, "mistral": r2, "gemini": r3},
@@ -339,32 +465,70 @@ def _consolidate_methodology(r1: dict, r2: dict, r3: dict, regex_scan: dict) ->
     }
-# ---------------------------------------------------------------------------
-# Critic prompt for optimization loop
-# ---------------------------------------------------------------------------
-def _critic_prompt(label, description, keyphrases, rep_docs):
-    kp = ", ".join(k[0] if isinstance(k,tuple) else k for k in keyphrases[:5])
-    ab = " | ".join(d[:300] for d in rep_docs[:3])
-    return f"""You are a strict quality auditor for research topic labels.
-CURRENT LABEL: "{label}"
-CURRENT DESCRIPTION: "{description}"
-KEYPHRASES: {kp}
-REPRESENTATIVE ABSTRACTS: {ab}
-Audit for: hallucination, vagueness, keyphrase alignment, specificity.
-Return ONLY valid JSON:
-{{
-  "refined_label": "<improved 5-8 word label>",
-  "refined_description": "<one sentence>",
-  "hallucination_detected": true/false,
-  "issues": ["<issue1>"],
-  "improvement_score": <0.0-1.0>,
-  "confidence": <0.0-1.0>
-}}"""
-# ---------------------------------------------------------------------------
-# Grounding check
-# ---------------------------------------------------------------------------
 def _grounding(label, keyphrases):
     if not label or not keyphrases: return {"verdict":"FAIL","score":0}
     lt = set(re.findall(r"\b[a-z]{3,}\b", label.lower()))
@@ -382,9 +546,9 @@ def _clean(s):
     return s[:60].rsplit(" ",1)[0] if len(s)>60 else s
-# ---------------------------------------------------------------------------
-# Node: embed + cluster
-# ---------------------------------------------------------------------------
 def embed_and_cluster(state: PipelineState) -> dict:
     from tools import run_topic_modeling
     try:
@@ -394,9 +558,6 @@ def embed_and_cluster(state: PipelineState) -> dict:
         return {"error": str(e)}
-# ---------------------------------------------------------------------------
-# Node: LLM Council (labels)
-# ---------------------------------------------------------------------------
 def llm_council(state: PipelineState) -> dict:
     td = state["topic_data"]
     if not td: return {"error": "No topic data"}
@@ -454,7 +615,6 @@ def llm_council(state: PipelineState) -> dict:
             "description":best.get("description",""),
             "pacis_match":best.get("pacis_match",""),
             "keyphrases":[k[0] if isinstance(k,tuple) else k for k in kps[:5]]}
         logger.info("Cluster %d → %s [%s]", cid, label, agreement)
     total    = len(sheets[4]) or 1
@@ -471,14 +631,10 @@ def llm_council(state: PipelineState) -> dict:
         pd.DataFrame(sheets[sn]).to_csv(path, index=False)
         sheet_paths[sn] = path
     with open("topics.json","w") as f: json.dump(sheets[4], f, indent=2)
     return {"interpretations":interps,"sheets":sheets,
             "agreement_rates":rates,"sheet_paths":sheet_paths,"json_path":"topics.json"}
-# ---------------------------------------------------------------------------
-# Node: optimization / refinement loop
-# ---------------------------------------------------------------------------
 def optimization_loop(state: PipelineState) -> dict:
     n_opt = state.get("n_optimize", 1)
     if n_opt <= 1:
@@ -493,7 +649,6 @@ def optimization_loop(state: PipelineState) -> dict:
     for iteration in range(n_opt - 1):
         iter_num = iteration + 2
         logger.info("Optimization iteration %d / %d", iter_num, n_opt)
         for cid in sorted(interps.keys()):
             kps           = td["keyphrases"].get(cid, [])
             rds           = td["representative_docs"].get(cid, [])
@@ -527,29 +682,11 @@ def optimization_loop(state: PipelineState) -> dict:
     for cid, interp in interps.items():
         if cid in label_map:
             label_map[cid]["label"] = interp["label"]
     return {"interpretations":interps,"sheets":sheets,"refinement_log":refinement_log}
-# ---------------------------------------------------------------------------
-# Node: 3-LLM methodology council + regex pre-scan
-# ---------------------------------------------------------------------------
 def extract_methodology(state: PipelineState) -> dict:
-    """
-    Per cluster:
-      1. Run METHODOLOGY_PATTERNS + TECHNIQUE_PATTERNS regex banks against
-         representative abstracts — produces ground-truth evidence with exact
-         match spans that are surfaced in the UI.
-      2. Build a human-readable regex summary and inject it into the LLM prompt
-         as grounding evidence.
-      3. Call Groq, Mistral, and Gemini with the same prompt — each LLM must
-         confirm or reject the regex hits, and may add anything it finds in
-         the full abstract text.
-      4. Consolidate: only methods/techniques agreed by ≥2 LLMs survive.
-         Percentages are averaged across agreeing LLMs.
-      5. Store full trace: regex_scan, per-LLM raw responses, consolidation
-         result — all exposed in the UI extraction pipeline tab.
-    """
     td      = state["topic_data"]
     interps = state.get("interpretations", {})
     client  = Groq(api_key=state["groq_key"], max_retries=0)
@@ -559,41 +696,25 @@ def extract_methodology(state: PipelineState) -> dict:
     for cid in sorted(td["keyphrases"].keys()):
         rds   = td["representative_docs"].get(cid, [])
         label = interps.get(cid, {}).get("label", f"Cluster {cid}")
-        # Step 1 — regex pre-scan
         scan       = _regex_scan(rds)
         regex_hint = _regex_summary(scan)
         logger.info("Cluster %d regex: %d method hits, %d technique hits",
                     cid, len(scan["methods"]), len(scan["techniques"]))
-        # Step 2 — all 3 LLMs with regex evidence in prompt
         prompt = _methodology_prompt(label, rds, regex_hint)
         r1 = _groq(client, prompt);    time.sleep(1)
         r2 = _mistral(prompt, mk);     time.sleep(1)
         r3 = _gemini(prompt, gk);      time.sleep(4)
-        logger.info("Cluster %d methodology votes — Groq:%s Mistral:%s Gemini:%s",
-                    cid, bool(r1.get("methodologies")),
-                    bool(r2.get("methodologies")), bool(r3.get("methodologies")))
-        # Step 3 — consolidate with ≥2 LLM agreement rule
         consolidated = _consolidate_methodology(r1, r2, r3, scan)
         methodology_data[cid] = consolidated
         logger.info("Cluster %d → dom_method: %s | dom_tech: %s",
                     cid, consolidated["dominant_method"], consolidated["dominant_technique"])
     return {"methodology_data": methodology_data}
-# ---------------------------------------------------------------------------
-# Node: top-3 papers
-# ---------------------------------------------------------------------------
 def collect_top_papers(state: PipelineState) -> dict:
     td      = state["topic_data"]
     interps = state.get("interpretations", {})
     top_papers = {}
     for cid in sorted(interps.keys()):
         rds   = td["representative_docs"].get(cid, [])
         label = interps.get(cid, {}).get("label", f"Cluster {cid}")
@@ -601,17 +722,12 @@ def collect_top_papers(state: PipelineState) -> dict:
         for rank, doc in enumerate(rds[:3], start=1):
             title_part    = doc.split(". ")[0][:120] if ". " in doc else doc[:120]
             abstract_part = doc[len(title_part):].strip(". ")[:400]
-            papers.append({"rank":rank,"title":title_part,
-                           "abstract_snippet":abstract_part,
                            "cluster":cid,"cluster_label":label})
         top_papers[cid] = papers
     return {"top_papers": top_papers}
-# ---------------------------------------------------------------------------
-# Node: mismatch table
-# ---------------------------------------------------------------------------
 def build_mismatch(state: PipelineState) -> dict:
     from tools import build_mismatch_table
     td         = state["topic_data"]
@@ -620,35 +736,372 @@ def build_mismatch(state: PipelineState) -> dict:
     return {"mismatch_table": build_mismatch_table(td["keyphrases"], labels_map)}
-# ---------------------------------------------------------------------------
-# Build LangGraph
-# ---------------------------------------------------------------------------
 def build_graph() -> StateGraph:
     g = StateGraph(PipelineState)
     g.add_node("embed_and_cluster",   embed_and_cluster)
     g.add_node("llm_council",         llm_council)
     g.add_node("optimization_loop",   optimization_loop)
     g.add_node("extract_methodology", extract_methodology)
     g.add_node("collect_top_papers",  collect_top_papers)
     g.add_node("build_mismatch",      build_mismatch)
     g.set_entry_point("embed_and_cluster")
     g.add_edge("embed_and_cluster",   "llm_council")
     g.add_edge("llm_council",         "optimization_loop")
     g.add_edge("optimization_loop",   "extract_methodology")
     g.add_edge("extract_methodology", "collect_top_papers")
     g.add_edge("collect_top_papers",  "build_mismatch")
-    g.add_edge("build_mismatch",      END)
     return g.compile()
 pipeline_graph = build_graph()
 def run_pipeline(filepath, groq_key, mistral_key, gemini_key,
-                 n_trials=50, n_optimize=1):
     return pipeline_graph.invoke({
-        "filepath":    filepath,
-        "groq_key":   groq_key,
-        "mistral_key": mistral_key,
-        "gemini_key":  gemini_key,
-        "n_trials":   n_trials,
-        "n_optimize":  n_optimize,
     })

 """
 agent.py — LangGraph-based topic analysis agent (§11).
+Original 3-LLM Council for topic modelling is UNCHANGED.
+NEW nodes appended:
+  - load_methodology_corpus     : load methodology CSV, detect journal per paper
+  - embed_methodology_vectors   : SPECTER-2 embed methodology text (separate vector space)
+  - extract_comp_techniques     : 3-LLM council (regex → Groq → Mistral → Gemini → consolidate)
+  - build_journal_crosstab      : technique × journal cross-tabulation with percentages
+  - optimize_technique_labels   : improvement / hallucination critique on consolidated techniques
 """
 from __future__ import annotations
 import json, logging, os, re, time
 GROQ_MODEL    = "llama-3.1-8b-instant"
 MISTRAL_MODEL = "mistral-small-latest"
+# ============================================================================
+# REGEX BANKS (used in both cluster methodology AND methodology-CSV pipeline)
+# ============================================================================
 METHODOLOGY_PATTERNS = {
     "Survey / Systematic Review":  re.compile(
         r"\b(survey|systematic\s+review|literature\s+review|bibliometric|scoping\s+review|meta.?analysis)\b", re.I),
         r"\b(reinforcement\s+learning|Q.learning|policy\s+gradient|reward\s+function|Markov\s+decision)\b", re.I),
     "Cloud / Big Data":            re.compile(
         r"\b(cloud\s+computing|Hadoop|Spark|MapReduce|big\s+data|distributed\s+computing|edge\s+computing)\b", re.I),
+    "Structural Equation Modelling": re.compile(
+        r"\b(structural\s+equation|SEM|PLS.SEM|covariance.based|CB.SEM|partial\s+least\s+squares)\b", re.I),
+    "Time Series / VAR":           re.compile(
+        r"\b(time\s+series|VAR\b|vector\s+auto.?regression|VARX|ARIMA|impulse\s+response|Granger)\b", re.I),
+    "Content Analysis / Coding":   re.compile(
+        r"\b(content\s+analysis|coding\s+scheme|thematic\s+analys|grounded\s+theory|open\s+coding|axial\s+coding)\b", re.I),
 }
 ORIENTATION_PATTERNS = {
     "mixed":       re.compile(r"\b(mixed\s+method|qualitative.+quantitative|both|triangulat)\b", re.I),
 }
+# Journal detection patterns applied to DOI + title
+JOURNAL_PATTERNS = {
+    "MISQ":   re.compile(
+        r"(misq|mis\s*quarterly|10\.25300|10\.2307/[0-9]{8}|MIS\s+Quarterly)", re.I),
+    "JAIS":   re.compile(
+        r"(jais|10\.17705/1jais|journal.*association.*information\s+systems)", re.I),
+    "ISR":    re.compile(
+        r"(10\.1287/isre|\bisr\b|information\s+systems\s+research)", re.I),
+    "JMIS":   re.compile(
+        r"(10\.1080/07421222|jmis|journal.*management.*information\s+systems)", re.I),
+    "PAJAIS": re.compile(
+        r"(pajais|pacific.*asia.*information|10\.17705/2asfp)", re.I),
+    "ECIS":   re.compile(
+        r"(ecis|european.*conference.*information\s+systems)", re.I),
+    "ICIS":   re.compile(
+        r"(icis|international.*conference.*information\s+systems)", re.I),
+}
+# ============================================================================
+# SHARED REGEX HELPERS
+# ============================================================================
 def _regex_scan(docs: list[str]) -> dict:
+    """Run pattern banks against docs. Returns hit dicts with exact match spans."""
     method_hits    = defaultdict(list)
     technique_hits = defaultdict(list)
     orientation_counts = {"empirical": 0, "theoretical": 0, "mixed": 0}
     for doc_idx, doc in enumerate(docs):
         for label, pat in METHODOLOGY_PATTERNS.items():
             for m in pat.finditer(doc):
+                method_hits[label].append({
+                    "doc": doc_idx + 1, "match": m.group(0),
+                    "span": [m.start(), m.end()]})
         for label, pat in TECHNIQUE_PATTERNS.items():
             for m in pat.finditer(doc):
+                technique_hits[label].append({
+                    "doc": doc_idx + 1, "match": m.group(0),
+                    "span": [m.start(), m.end()]})
         for orient, pat in ORIENTATION_PATTERNS.items():
             if pat.search(doc):
                 orientation_counts[orient] += 1
     total_orient = sum(orientation_counts.values()) or 1
     return {
         "methods":    {k: v for k, v in method_hits.items()    if v},
         "techniques": {k: v for k, v in technique_hits.items() if v},
         "patterns_applied": {
             "methodology": list(METHODOLOGY_PATTERNS.keys()),
             "technique":   list(TECHNIQUE_PATTERNS.keys()),
         },
     }
 def _regex_summary(scan: dict) -> str:
+    """Human-readable regex evidence injected into LLM prompts."""
     lines = []
     if scan["methods"]:
         lines.append("REGEX-DETECTED METHODOLOGIES:")
         for k, hits in scan["methods"].items():
+            unique = list(dict.fromkeys(h["match"] for h in hits))[:3]
             papers = sorted({h["doc"] for h in hits})
+            lines.append(f"  • {k} — matched: {unique} (papers: {papers})")
     if scan["techniques"]:
         lines.append("REGEX-DETECTED TECHNIQUES:")
         for k, hits in scan["techniques"].items():
+            unique = list(dict.fromkeys(h["match"] for h in hits))[:3]
             papers = sorted({h["doc"] for h in hits})
+            lines.append(f"  • {k} — matched: {unique} (papers: {papers})")
+    return "\n".join(lines) or "No regex hits found — rely on methodology text alone."
+def _detect_journal(doi: str, title: str) -> str:
+    """Detect journal from DOI + title using JOURNAL_PATTERNS. Returns 'Other' if unknown."""
+    text = f"{doi or ''} {title or ''}"
+    for journal, pat in JOURNAL_PATTERNS.items():
+        if pat.search(text):
+            return journal
+    return "MISQ"   # methodology CSV default — override downstream if needed
+# ============================================================================
+# LANGGRAPH STATE
+# ============================================================================
 class PipelineState(TypedDict, total=False):
+    # ── original fields (DO NOT CHANGE) ──────────────────────────────────────
     filepath: str
     groq_key: str
     mistral_key: str
     refinement_log: list
     json_path: str
     error: str
+    # ── new fields for methodology-CSV pipeline ───────────────────────────────
+    methodology_filepath: str          # uploaded methodology CSV path
+    methodology_papers: list           # [{title, doi, methodology, journal, paper_idx}]
+    methodology_embeddings: list       # SPECTER-2 embeddings (separate vector space)
+    comp_technique_sheets: dict        # {1:Groq, 2:Mistral, 3:Gemini, 4:Consolidated}
+    journal_crosstab: dict             # {journal: {technique: pct}}
+    technique_opt_log: list            # improvement suggestions from optimizer
+# ============================================================================
+# API HELPERS (unchanged)
+# ============================================================================
 def _parse(raw: str) -> dict:
     raw = raw.strip().replace("```json","").replace("```","").strip()
     s, e = raw.find("{"), raw.rfind("}")+1
 def _groq(client, prompt):
     try:
         r = client.chat.completions.create(model=GROQ_MODEL,
+            messages=[{"role":"user","content":prompt}], temperature=0.2, timeout=30)
         return _parse(r.choices[0].message.content)
     except Exception as e: logger.warning("Groq: %s", e); return {}
         r = requests.post("https://api.mistral.ai/v1/chat/completions",
             headers={"Authorization":f"Bearer {key}","Content-Type":"application/json"},
             json={"model":MISTRAL_MODEL,"messages":[{"role":"user","content":prompt}],
+                  "temperature":0.2}, timeout=30)
         return _parse(r.json()["choices"][0]["message"]["content"])
     except Exception as e: logger.warning("Mistral: %s", e); return {}
                 msg = err.get("message","") if isinstance(err,dict) else str(err)
                 if "quota" in msg.lower() or "rate" in msg.lower():
                     wait = min(40, 10*(attempt+1))
+                    logger.warning("Gemini rate-limited, waiting %ds…", wait)
+                    time.sleep(wait); continue
                 logger.warning("Gemini attempt %d: %s", attempt+1, msg); return {}
             return _parse(d["candidates"][0]["content"]["parts"][0]["text"])
         except Exception as e:
     return {}
+# ============================================================================
+# ORIGINAL PROMPTS (unchanged)
+# ============================================================================
 def _label_prompt(keyphrases, rep_docs):
     kp = ", ".join(k[0] if isinstance(k,tuple) else k for k in keyphrases[:5])
     ab = " | ".join(a[:250] for a in rep_docs[:3])
 Pick the best label or synthesise a better one.
 Return ONLY JSON: {{"label":"...","description":"...","pacis_match":"...","confidence":0.0}}"""
 def _methodology_prompt(label: str, rep_docs: list[str], regex_summary: str) -> str:
     ab = "\n\n".join(f"Paper {i+1}: {d[:500]}" for i,d in enumerate(rep_docs[:3]))
     return f"""You are a research methodology auditor for the cluster: "{label}".
   "regex_rejected":  ["<label2>"]
 }}"""
+def _critic_prompt(label, description, keyphrases, rep_docs):
+    kp = ", ".join(k[0] if isinstance(k,tuple) else k for k in keyphrases[:5])
+    ab = " | ".join(d[:300] for d in rep_docs[:3])
+    return f"""You are a strict quality auditor for research topic labels.
+CURRENT LABEL: "{label}"
+CURRENT DESCRIPTION: "{description}"
+KEYPHRASES: {kp}
+REPRESENTATIVE ABSTRACTS: {ab}
+Audit for: hallucination, vagueness, keyphrase alignment, specificity.
+Return ONLY valid JSON:
+{{
+  "refined_label": "<improved 5-8 word label>",
+  "refined_description": "<one sentence>",
+  "hallucination_detected": true/false,
+  "issues": ["<issue1>"],
+  "improvement_score": <0.0-1.0>,
+  "confidence": <0.0-1.0>
+}}"""
+# ============================================================================
+# NEW: COMPUTATIONAL TECHNIQUE PROMPTS
+# ============================================================================
+def _comp_technique_batch_prompt(papers: list[dict], regex_hint: str) -> str:
     """
+    Prompt fed to each LLM for a batch of methodology-CSV papers.
+    Papers have keys: paper_idx, title, journal, methodology (text).
+    regex_hint is the pre-scanned regex evidence for this batch.
     """
+    batch_text = "\n\n".join(
+        f"PAPER {p['paper_idx']} [{p['journal']}] — {p['title'][:100]}\n"
+        f"METHODOLOGY TEXT: {p['methodology'][:800]}"
+        for p in papers
+    )
+    paper_ids = [p['paper_idx'] for p in papers]
+    return f"""You are a computational technique extractor for IS research papers.
+REGEX PRE-SCAN (ground truth hints from pattern matching):
+{regex_hint}
+PAPERS:
+{batch_text}
+For EACH paper listed above ({paper_ids}), identify the computational techniques used.
+A computational technique must be explicitly mentioned or clearly implied in the text.
+Do NOT hallucinate — if a paper uses no computational technique, return empty list.
+Also for each technique found across ALL papers, compute what percentage of papers in this
+batch use that technique.
+Return ONLY valid JSON:
+{{
+  "per_paper": {{
+    "<paper_idx>": {{
+      "techniques": ["<technique1>", "<technique2>"],
+      "evidence":   ["<≤12 word quote1>", "<≤12 word quote2>"],
+      "confidence": <0.0-1.0>
+    }}
+  }},
+  "batch_technique_pct": {{
+    "<technique_name>": <percentage_of_papers_in_batch_0-100>
+  }},
+  "dominant_technique": "<most common technique in batch>",
+  "no_technique_papers": [<paper_idxs with no clear computational technique>]
+}}"""
+def _technique_critique_prompt(technique: str, journal: str, pct_groq: float,
+                                pct_mistral: float, pct_gemini: float,
+                                evidence_samples: list[str]) -> str:
+    """Optimization critic for a single consolidated technique label."""
+    ev = " | ".join(evidence_samples[:3])
+    return f"""You are a research technique label auditor.
+TECHNIQUE: "{technique}"
+JOURNAL: {journal}
+GROQ extracted it in {pct_groq:.0f}% of papers
+MISTRAL extracted it in {pct_mistral:.0f}% of papers
+GEMINI extracted it in {pct_gemini:.0f}% of papers
+EVIDENCE QUOTES: {ev}
+Audit:
+1. Is the technique name precise and not hallucinated?
+2. Is there inter-LLM disagreement (>15% gap) suggesting ambiguity?
+3. Should this be split into sub-techniques or merged with another?
+4. Suggest a refined canonical name if needed.
+Return ONLY valid JSON:
+{{
+  "refined_name": "<canonical technique name or same if fine>",
+  "is_hallucination": true/false,
+  "high_variance_across_llms": true/false,
+  "suggestion": "<one sentence improvement recommendation>",
+  "split_into": ["<sub-tech1>", "<sub-tech2>"],
+  "merge_with": "<other technique name or null>",
+  "confidence": <0.0-1.0>
+}}"""
+# ============================================================================
+# CONSOLIDATION HELPERS (original + new)
+# ============================================================================
+def _consolidate_methodology(r1: dict, r2: dict, r3: dict, regex_scan: dict) -> dict:
+    """Merge Groq + Mistral + Gemini methodology responses. ≥2 LLM gate."""
     def _name_map(r, key):
         return {item["name"].strip().lower(): item for item in r.get(key, [])}
     def _merge_items(key):
+        maps     = [_name_map(r, key) for r in [r1, r2, r3]]
         all_keys = set().union(*[m.keys() for m in maps])
         accepted, rejected = [], []
         for k in all_keys:
+            voters  = [m[k] for m in maps if k in m]
+            n_votes = len(voters)
+            avg_pct = round(sum(v.get("pct",0) for v in voters) / n_votes)
+            papers  = sorted({p for v in voters for p in v.get("papers",[])})
+            evidence= next((v.get("evidence","") for v in voters if v.get("evidence")), "")
+            row = {"name": voters[0]["name"], "pct": avg_pct, "papers": papers,
+                   "evidence": evidence, "llm_votes": n_votes,
+                   "agreement": "Triple" if n_votes==3 else "Two" if n_votes==2 else "Single"}
             (accepted if n_votes >= 2 else rejected).append(row)
         return (sorted(accepted, key=lambda x: -x["pct"]),
                 sorted(rejected, key=lambda x: -x["pct"]))
+    methods_acc,    methods_rej    = _merge_items("methodologies")
     techniques_acc, techniques_rej = _merge_items("techniques")
     emp_avg  = round(sum(r.get("empirical_pct",  0) for r in [r1,r2,r3]) / 3)
     confirmed_votes = Counter(item for r in [r1,r2,r3] for item in r.get("regex_confirmed",[]))
     rejected_votes  = Counter(item for r in [r1,r2,r3] for item in r.get("regex_rejected",[]))
     dom_m = Counter(r.get("dominant_method","")    for r in [r1,r2,r3] if r).most_common(1)
     dom_t = Counter(r.get("dominant_technique","") for r in [r1,r2,r3] if r).most_common(1)
     return {
+        "methodologies": methods_acc, "techniques": techniques_acc,
+        "rejected_methods": methods_rej, "rejected_techniques": techniques_rej,
         "dominant_method":    dom_m[0][0] if dom_m else "—",
         "dominant_technique": dom_t[0][0] if dom_t else "—",
+        "empirical_pct": emp_avg, "theoretical_pct": theo_avg, "mixed_pct": mix_avg,
         "regex_confirmed_consensus": [k for k,v in confirmed_votes.items() if v>=2],
         "regex_rejected_consensus":  [k for k,v in rejected_votes.items()  if v>=2],
         "llm_raw": {"groq": r1, "mistral": r2, "gemini": r3},
     }
+def _consolidate_comp_techniques(r1: dict, r2: dict, r3: dict,
+                                  papers: list[dict]) -> dict:
+    """
+    Consolidate per-paper technique extraction from 3 LLMs.
+    Rule: a technique is accepted for a paper when ≥2 LLMs named it.
+    Builds per-LLM technique % and consolidated %.
+    """
+    all_paper_ids = [str(p["paper_idx"]) for p in papers]
+    def _get_per_paper(resp):
+        return resp.get("per_paper", {})
+    def _get_batch_pct(resp):
+        return resp.get("batch_technique_pct", {})
+    # Per-LLM batch percentages (for LLM sheets)
+    pct_groq    = {k.lower(): v for k,v in _get_batch_pct(r1).items()}
+    pct_mistral = {k.lower(): v for k,v in _get_batch_pct(r2).items()}
+    pct_gemini  = {k.lower(): v for k,v in _get_batch_pct(r3).items()}
+    all_tech_keys = set(pct_groq) | set(pct_mistral) | set(pct_gemini)
+    # ≥2 LLM gate for consolidated batch %
+    consolidated_pct = {}
+    for tk in all_tech_keys:
+        vals = [d[tk] for d in [pct_groq, pct_mistral, pct_gemini] if tk in d]
+        if len(vals) >= 2:
+            consolidated_pct[tk] = round(sum(vals) / len(vals))
+    # Per-paper consolidated techniques (≥2 LLMs must name the technique for that paper)
+    per_paper_groq    = _get_per_paper(r1)
+    per_paper_mistral = _get_per_paper(r2)
+    per_paper_gemini  = _get_per_paper(r3)
+    per_paper_consolidated = {}
+    for pid in all_paper_ids:
+        techs_groq    = set(t.lower() for t in per_paper_groq.get(pid,   {}).get("techniques", []))
+        techs_mistral = set(t.lower() for t in per_paper_mistral.get(pid,{}).get("techniques", []))
+        techs_gemini  = set(t.lower() for t in per_paper_gemini.get(pid, {}).get("techniques", []))
+        # Union of all named techniques
+        all_named = techs_groq | techs_mistral | techs_gemini
+        accepted  = [t for t in all_named
+                     if sum([t in techs_groq, t in techs_mistral, t in techs_gemini]) >= 2]
+        per_paper_consolidated[pid] = accepted
+    dom_g = r1.get("dominant_technique","—")
+    dom_m = r2.get("dominant_technique","—")
+    dom_gem = r3.get("dominant_technique","—")
+    dominant = Counter([dom_g, dom_m, dom_gem]).most_common(1)
+    return {
+        "per_paper_consolidated": per_paper_consolidated,
+        "consolidated_pct":       consolidated_pct,
+        "pct_groq":               pct_groq,
+        "pct_mistral":            pct_mistral,
+        "pct_gemini":             pct_gemini,
+        "dominant_technique":     dominant[0][0] if dominant else "—",
+        "raw": {"groq": r1, "mistral": r2, "gemini": r3},
+    }
+# ============================================================================
+# GROUNDING + CLEAN
+# ============================================================================
 def _grounding(label, keyphrases):
     if not label or not keyphrases: return {"verdict":"FAIL","score":0}
     lt = set(re.findall(r"\b[a-z]{3,}\b", label.lower()))
     return s[:60].rsplit(" ",1)[0] if len(s)>60 else s
+# ============================================================================
+# ORIGINAL NODES (DO NOT CHANGE)
+# ============================================================================
 def embed_and_cluster(state: PipelineState) -> dict:
     from tools import run_topic_modeling
     try:
         return {"error": str(e)}
 def llm_council(state: PipelineState) -> dict:
     td = state["topic_data"]
     if not td: return {"error": "No topic data"}
             "description":best.get("description",""),
             "pacis_match":best.get("pacis_match",""),
             "keyphrases":[k[0] if isinstance(k,tuple) else k for k in kps[:5]]}
         logger.info("Cluster %d → %s [%s]", cid, label, agreement)
     total    = len(sheets[4]) or 1
         pd.DataFrame(sheets[sn]).to_csv(path, index=False)
         sheet_paths[sn] = path
     with open("topics.json","w") as f: json.dump(sheets[4], f, indent=2)
     return {"interpretations":interps,"sheets":sheets,
             "agreement_rates":rates,"sheet_paths":sheet_paths,"json_path":"topics.json"}
 def optimization_loop(state: PipelineState) -> dict:
     n_opt = state.get("n_optimize", 1)
     if n_opt <= 1:
     for iteration in range(n_opt - 1):
         iter_num = iteration + 2
         logger.info("Optimization iteration %d / %d", iter_num, n_opt)
         for cid in sorted(interps.keys()):
             kps           = td["keyphrases"].get(cid, [])
             rds           = td["representative_docs"].get(cid, [])
     for cid, interp in interps.items():
         if cid in label_map:
             label_map[cid]["label"] = interp["label"]
     return {"interpretations":interps,"sheets":sheets,"refinement_log":refinement_log}
 def extract_methodology(state: PipelineState) -> dict:
+    """3-LLM council for cluster-level methodology (unchanged logic)."""
     td      = state["topic_data"]
     interps = state.get("interpretations", {})
     client  = Groq(api_key=state["groq_key"], max_retries=0)
     for cid in sorted(td["keyphrases"].keys()):
         rds   = td["representative_docs"].get(cid, [])
         label = interps.get(cid, {}).get("label", f"Cluster {cid}")
         scan       = _regex_scan(rds)
         regex_hint = _regex_summary(scan)
         logger.info("Cluster %d regex: %d method hits, %d technique hits",
                     cid, len(scan["methods"]), len(scan["techniques"]))
         prompt = _methodology_prompt(label, rds, regex_hint)
         r1 = _groq(client, prompt);    time.sleep(1)
         r2 = _mistral(prompt, mk);     time.sleep(1)
         r3 = _gemini(prompt, gk);      time.sleep(4)
         consolidated = _consolidate_methodology(r1, r2, r3, scan)
         methodology_data[cid] = consolidated
         logger.info("Cluster %d → dom_method: %s | dom_tech: %s",
                     cid, consolidated["dominant_method"], consolidated["dominant_technique"])
     return {"methodology_data": methodology_data}
 def collect_top_papers(state: PipelineState) -> dict:
     td      = state["topic_data"]
     interps = state.get("interpretations", {})
     top_papers = {}
     for cid in sorted(interps.keys()):
         rds   = td["representative_docs"].get(cid, [])
         label = interps.get(cid, {}).get("label", f"Cluster {cid}")
         for rank, doc in enumerate(rds[:3], start=1):
             title_part    = doc.split(". ")[0][:120] if ". " in doc else doc[:120]
             abstract_part = doc[len(title_part):].strip(". ")[:400]
+            papers.append({"rank":rank,"title":title_part,"abstract_snippet":abstract_part,
                            "cluster":cid,"cluster_label":label})
         top_papers[cid] = papers
     return {"top_papers": top_papers}
 def build_mismatch(state: PipelineState) -> dict:
     from tools import build_mismatch_table
     td         = state["topic_data"]
     return {"mismatch_table": build_mismatch_table(td["keyphrases"], labels_map)}
+# ============================================================================
+# NEW NODE 1: load_methodology_corpus
+# ============================================================================
+def load_methodology_corpus(state: PipelineState) -> dict:
+    """
+    Load the methodology CSV (title, doi, methodology).
+    Detect journal for each paper using JOURNAL_PATTERNS applied to doi + title.
+    Assigns paper_idx starting at 1.
+    Returns methodology_papers list ready for embedding and LLM extraction.
+    """
+    fpath = state.get("methodology_filepath")
+    if not fpath:
+        logger.info("No methodology CSV provided — skipping methodology pipeline.")
+        return {"methodology_papers": []}
+    df = pd.read_csv(fpath)
+    df.columns = df.columns.str.lower()
+    required = {"title","methodology"}
+    missing = required - set(df.columns)
+    if missing:
+        logger.warning("Methodology CSV missing columns: %s — skipping.", missing)
+        return {"methodology_papers": []}
+    if "doi" not in df.columns:
+        df["doi"] = "N/A"
+    papers = []
+    for idx, row in df.iterrows():
+        title      = str(row.get("title","") or "")
+        doi        = str(row.get("doi","N/A") or "N/A")
+        methodology= str(row.get("methodology","") or "")
+        journal    = _detect_journal(doi, title)
+        papers.append({
+            "paper_idx":   idx + 1,
+            "title":       title,
+            "doi":         doi,
+            "methodology": methodology,
+            "journal":     journal,
+        })
+    journals_found = Counter(p["journal"] for p in papers)
+    logger.info("Loaded %d methodology papers. Journals: %s", len(papers), dict(journals_found))
+    return {"methodology_papers": papers}
+# ============================================================================
+# NEW NODE 2: embed_methodology_vectors
+# ============================================================================
+def embed_methodology_vectors(state: PipelineState) -> dict:
+    """
+    Embed methodology text as a SEPARATE vector space from the corpus.
+    Uses the same SPECTER-2 model but applied to methodology text only.
+    Embeddings stored as a list of lists for JSON-serialisability.
+    """
+    papers = state.get("methodology_papers", [])
+    if not papers:
+        return {"methodology_embeddings": []}
+    from sentence_transformers import SentenceTransformer
+    texts = [p["methodology"][:1500] for p in papers]   # cap at 1500 chars
+    logger.info("Embedding %d methodology texts with SPECTER-2 (separate vector space)…", len(texts))
+    model = SentenceTransformer("allenai/specter2_base")
+    embeddings = model.encode(texts, show_progress_bar=True, batch_size=32)
+    logger.info("Methodology embeddings: %s", embeddings.shape)
+    return {"methodology_embeddings": embeddings.tolist()}
+# ============================================================================
+# NEW NODE 3: extract_comp_techniques  (3-LLM Council)
+# ============================================================================
+def extract_comp_techniques(state: PipelineState) -> dict:
+    """
+    3-LLM Council to extract computational techniques from methodology-CSV papers.
+    Pipeline per batch of BATCH_SIZE papers:
+      1. Regex pre-scan  → TECHNIQUE_PATTERNS on methodology text
+      2. Groq call       → per-paper techniques + batch %
+      3. Mistral call    → per-paper techniques + batch %
+      4. Gemini call     → per-paper techniques + batch %
+      5. Consolidate     → ≥2 LLM gate per (paper, technique)
+    Produces 4 sheets (mirroring topic sheets):
+      Sheet 1 = Groq raw
+      Sheet 2 = Mistral raw
+      Sheet 3 = Gemini raw
+      Sheet 4 = Consolidated (≥2 LLM agreement)
+    """
+    papers = state.get("methodology_papers", [])
+    if not papers:
+        return {"comp_technique_sheets": {1:[], 2:[], 3:[], 4:[]}}
+    client = Groq(api_key=state["groq_key"], max_retries=0)
+    mk, gk = state["mistral_key"], state["gemini_key"]
+    BATCH_SIZE = 5
+    sheets = {1:[], 2:[], 3:[], 4:[]}
+    # Accumulate consolidated per-paper techniques across batches
+    all_consolidated = {}   # {paper_idx: [technique_names]}
+    for batch_start in range(0, len(papers), BATCH_SIZE):
+        batch = papers[batch_start: batch_start + BATCH_SIZE]
+        batch_texts = [p["methodology"][:1500] for p in batch]
+        # Step 1 — regex pre-scan on batch
+        scan       = _regex_scan(batch_texts)
+        regex_hint = _regex_summary(scan)
+        logger.info("Batch %d-%d | regex: %d tech hits",
+                    batch[0]["paper_idx"], batch[-1]["paper_idx"], len(scan["techniques"]))
+        # Step 2 — 3 LLM calls
+        prompt = _comp_technique_batch_prompt(batch, regex_hint)
+        r1 = _groq(client, prompt);    time.sleep(1)
+        r2 = _mistral(prompt, mk);     time.sleep(1)
+        r3 = _gemini(prompt, gk);      time.sleep(4)
+        # Step 3 — consolidate
+        consolidated = _consolidate_comp_techniques(r1, r2, r3, batch)
+        # Build sheet rows — one row per paper per LLM
+        for p in batch:
+            pid  = str(p["paper_idx"])
+            journal = p["journal"]
+            title   = p["title"][:80]
+            def _fmt_llm(resp):
+                pp = resp.get("per_paper", {}).get(pid, {})
+                return {
+                    "paper_idx": p["paper_idx"],
+                    "title":     title,
+                    "journal":   journal,
+                    "techniques": ", ".join(pp.get("techniques", [])) or "—",
+                    "evidence":  " | ".join(pp.get("evidence", []))[:200] or "—",
+                    "confidence":pp.get("confidence","—"),
+                }
+            sheets[1].append(_fmt_llm(r1))
+            sheets[2].append(_fmt_llm(r2))
+            sheets[3].append(_fmt_llm(r3))
+            con_techs = consolidated["per_paper_consolidated"].get(pid, [])
+            sheets[4].append({
+                "paper_idx":  p["paper_idx"],
+                "title":      title,
+                "journal":    journal,
+                "techniques": ", ".join(con_techs) or "—",
+                "n_techniques": len(con_techs),
+                "dominant":   consolidated["dominant_technique"],
+            })
+            all_consolidated[p["paper_idx"]] = con_techs
+        logger.info("Batch consolidated dominant: %s", consolidated["dominant_technique"])
+    # Save 4 sheets as CSV
+    sheet_names = {1:"tech_sheet1_groq",2:"tech_sheet2_mistral",
+                   3:"tech_sheet3_gemini",4:"tech_sheet4_consolidated"}
+    for sn, name in sheet_names.items():
+        pd.DataFrame(sheets[sn]).to_csv(f"{name}.csv", index=False)
+    # Attach per_paper_consolidated back to papers for cross-tab use
+    for p in papers:
+        p["consolidated_techniques"] = all_consolidated.get(p["paper_idx"], [])
+    return {
+        "comp_technique_sheets": sheets,
+        "methodology_papers":    papers,   # updated with consolidated_techniques
+    }
+# ============================================================================
+# NEW NODE 4: build_journal_crosstab
+# ============================================================================
+def build_journal_crosstab(state: PipelineState) -> dict:
+    """
+    Build a technique × journal cross-tabulation.
+    For each journal in the methodology CSV, compute what % of papers in that
+    journal mention each consolidated technique.
+    Also produces per-LLM technique percentage tables for inter-LLM comparison.
+    """
+    papers = state.get("methodology_papers", [])
+    if not papers:
+        return {"journal_crosstab": {}}
+    sheets = state.get("comp_technique_sheets", {})
+    # --- Consolidated cross-tab ---
+    journal_tech_counts = defaultdict(lambda: defaultdict(int))
+    journal_paper_counts = defaultdict(int)
+    for p in papers:
+        journal = p["journal"]
+        journal_paper_counts[journal] += 1
+        for tech in p.get("consolidated_techniques", []):
+            journal_tech_counts[journal][tech.title()] += 1
+    journals      = sorted(journal_paper_counts.keys())
+    all_techniques = sorted({t for j in journal_tech_counts.values() for t in j.keys()})
+    crosstab = {}
+    for journal in journals:
+        n = journal_paper_counts[journal] or 1
+        crosstab[journal] = {
+            tech: round(journal_tech_counts[journal].get(tech, 0) / n * 100)
+            for tech in all_techniques
+        }
+    # --- Per-LLM technique frequency across ALL papers ---
+    def _llm_tech_freq(sheet_rows: list) -> dict:
+        tech_count = defaultdict(int)
+        n_papers   = len(sheet_rows) or 1
+        for row in sheet_rows:
+            raw = row.get("techniques","")
+            for t in (raw.split(", ") if raw and raw != "—" else []):
+                tech_count[t.strip().title()] += 1
+        return {t: round(c/n_papers*100) for t,c in tech_count.items()}
+    per_llm_freq = {
+        "Groq":    _llm_tech_freq(sheets.get(1,[])),
+        "Mistral": _llm_tech_freq(sheets.get(2,[])),
+        "Gemini":  _llm_tech_freq(sheets.get(3,[])),
+    }
+    logger.info("Journal crosstab: %d journals × %d techniques",
+                len(journals), len(all_techniques))
+    return {
+        "journal_crosstab": {
+            "consolidated": crosstab,
+            "journals":     journals,
+            "techniques":   all_techniques,
+            "journal_paper_counts": dict(journal_paper_counts),
+            "per_llm_freq": per_llm_freq,
+        }
+    }
+# ============================================================================
+# NEW NODE 5: optimize_technique_labels
+# ============================================================================
+def optimize_technique_labels(state: PipelineState) -> dict:
+    """
+    Optimization / improvement pass for computational technique labels.
+    Runs Groq critic on each consolidated technique found across all journals.
+    Checks: hallucination, high inter-LLM variance, merge/split suggestions.
+    Stores improvement suggestions in technique_opt_log for display in UI.
+    Only applies optimisation when n_optimize > 1.
+    """
+    n_opt = state.get("n_optimize", 1)
+    if n_opt <= 1:
+        return {"technique_opt_log": []}
+    crosstab_data = state.get("journal_crosstab", {})
+    all_techniques = crosstab_data.get("techniques", [])
+    if not all_techniques:
+        return {"technique_opt_log": []}
+    client   = Groq(api_key=state["groq_key"], max_retries=0)
+    per_llm  = crosstab_data.get("per_llm_freq", {})
+    papers   = state.get("methodology_papers", [])
+    opt_log  = []
+    # Sample evidence quotes for each technique from methodology texts
+    def _evidence_for(technique: str) -> list[str]:
+        tech_lower = technique.lower()
+        samples = []
+        for p in papers[:30]:                     # cap at first 30 papers for speed
+            text = p.get("methodology","")
+            for pat in TECHNIQUE_PATTERNS.values():
+                for m in pat.finditer(text):
+                    if tech_lower in m.group(0).lower() or technique.lower() in tech_lower:
+                        snippet = text[max(0,m.start()-40):m.end()+40].replace("\n"," ")
+                        samples.append(snippet[:120])
+            if len(samples) >= 3:
+                break
+        return samples[:3]
+    for tech in all_techniques:
+        pct_g   = per_llm.get("Groq",{}).get(tech, 0)
+        pct_m   = per_llm.get("Mistral",{}).get(tech, 0)
+        pct_gem = per_llm.get("Gemini",{}).get(tech, 0)
+        evidence= _evidence_for(tech)
+        # Only run critique if there is meaningful inter-LLM variance or low confidence
+        max_pct = max(pct_g, pct_m, pct_gem)
+        min_pct = min(pct_g, pct_m, pct_gem)
+        run_critique = (max_pct - min_pct) > 15 or max_pct < 20
+        if not run_critique:
+            continue
+        critique = _groq(client,
+            _technique_critique_prompt(tech, "All Journals", pct_g, pct_m, pct_gem, evidence))
+        time.sleep(0.8)
+        if not critique:
+            continue
+        opt_log.append({
+            "technique":              tech,
+            "refined_name":           critique.get("refined_name", tech),
+            "is_hallucination":       critique.get("is_hallucination", False),
+            "high_variance":          critique.get("high_variance_across_llms", False),
+            "suggestion":             critique.get("suggestion","—"),
+            "split_into":             ", ".join(critique.get("split_into",[])  or []) or "—",
+            "merge_with":             critique.get("merge_with","—") or "—",
+            "pct_groq":               pct_g,
+            "pct_mistral":            pct_m,
+            "pct_gemini":             pct_gem,
+            "confidence":             critique.get("confidence", 0),
+        })
+        logger.info("Technique opt: '%s' → '%s'", tech, critique.get("refined_name",tech))
+    return {"technique_opt_log": opt_log}
+# ============================================================================
+# GRAPH ASSEMBLY
+# ============================================================================
 def build_graph() -> StateGraph:
     g = StateGraph(PipelineState)
+    # ── original nodes ───────────────────────────────────────────────────────
     g.add_node("embed_and_cluster",   embed_and_cluster)
     g.add_node("llm_council",         llm_council)
     g.add_node("optimization_loop",   optimization_loop)
     g.add_node("extract_methodology", extract_methodology)
     g.add_node("collect_top_papers",  collect_top_papers)
     g.add_node("build_mismatch",      build_mismatch)
+    # ── new methodology-CSV nodes ─────────────────────────────────────────────
+    g.add_node("load_methodology_corpus",   load_methodology_corpus)
+    g.add_node("embed_methodology_vectors", embed_methodology_vectors)
+    g.add_node("extract_comp_techniques",   extract_comp_techniques)
+    g.add_node("build_journal_crosstab",    build_journal_crosstab)
+    g.add_node("optimize_technique_labels", optimize_technique_labels)
+    # ── original edges (unchanged) ────────────────────────────────────────────
     g.set_entry_point("embed_and_cluster")
     g.add_edge("embed_and_cluster",   "llm_council")
     g.add_edge("llm_council",         "optimization_loop")
     g.add_edge("optimization_loop",   "extract_methodology")
     g.add_edge("extract_methodology", "collect_top_papers")
     g.add_edge("collect_top_papers",  "build_mismatch")
+    # ── new edges: methodology CSV pipeline runs after core pipeline ──────────
+    g.add_edge("build_mismatch",            "load_methodology_corpus")
+    g.add_edge("load_methodology_corpus",   "embed_methodology_vectors")
+    g.add_edge("embed_methodology_vectors", "extract_comp_techniques")
+    g.add_edge("extract_comp_techniques",   "build_journal_crosstab")
+    g.add_edge("build_journal_crosstab",    "optimize_technique_labels")
+    g.add_edge("optimize_technique_labels", END)
     return g.compile()
 pipeline_graph = build_graph()
 def run_pipeline(filepath, groq_key, mistral_key, gemini_key,
+                 n_trials=50, n_optimize=1, methodology_filepath=None):
+    """Convenience wrapper — methodology_filepath is optional."""
     return pipeline_graph.invoke({
+        "filepath":             filepath,
+        "groq_key":             groq_key,
+        "mistral_key":          mistral_key,
+        "gemini_key":           gemini_key,
+        "n_trials":             n_trials,
+        "n_optimize":           n_optimize,
+        "methodology_filepath": methodology_filepath,
     })