BERTopic_AG_final

Sleeping

App Files Files Community

BHAVIKBANKER committed 21 days ago

Commit

a627b52

verified ·

1 Parent(s): 64049b0

Update app.py

Browse files

Files changed (1) hide show

app.py +377 -204

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
-"""app.py — Gradio UI entry point.
-Tabs: Summary, UMAP, Pareto, Trial Log, Clusters, Top 3 Papers,
-      Methodology (3-LLM council + regex pipeline), Refinement Log,
-      Sheet 1-4, RQ Mismatch, Downloads.
 """
 import os, json
 import pandas as pd, numpy as np
@@ -28,7 +29,24 @@ def _preview(file):
         f"**Usable papers:** {n - max(blanks_t, blanks_a)} / {n}")
-# ── Helper builders ──────────────────────────────────────────────────────────
 def _top_papers_df(top_papers: dict) -> pd.DataFrame:
     rows = []
     for cid in sorted(top_papers.keys()):
@@ -65,124 +83,78 @@ def _methodology_summary_df(methodology_data: dict, interps: dict) -> pd.DataFra
 def _extraction_pipeline_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
-    """
-    One row per (cluster, method/technique) showing the full extraction trace:
-    which regex pattern fired, what text it matched, which LLMs confirmed it,
-    and whether it passed the ≥2-LLM gate.
-    """
     rows = []
     for cid in sorted(methodology_data.keys()):
         md    = methodology_data[cid]
         label = interps.get(cid, {}).get("label", f"Cluster {cid}")
         scan  = md.get("regex_scan", {})
-        # Accepted items
         for item in md.get("methodologies", []) + md.get("techniques", []):
-            name   = item["name"]
-            # Find regex hits for this category name
-            regex_hits = scan.get("methods", {}).get(name, []) or \
-                         scan.get("techniques", {}).get(name, [])
-            matched_text = ", ".join(
-                dict.fromkeys(h["match"] for h in regex_hits))[:80] if regex_hits else "—"
-            rows.append({
-                "Cluster":       cid,
-                "Label":         label,
-                "Item":          name,
-                "Type":          "Method" if item in md.get("methodologies",[]) else "Technique",
-                "Regex Match":   matched_text,
-                "Regex Fired":   "✅" if regex_hits else "❌",
-                "LLM Votes":     item["llm_votes"],
-                "Agreement":     item["agreement"],
-                "Avg Pct (%)":   item["pct"],
-                "Evidence":      item.get("evidence", "—"),
-                "Gate Passed":   "✅ ACCEPTED",
-            })
-        # Rejected items (single LLM only)
-        for item in md.get("rejected_methods", []) + md.get("rejected_techniques", []):
             name      = item["name"]
-            regex_hits = scan.get("methods", {}).get(name, []) or \
-                         scan.get("techniques", {}).get(name, [])
-            matched_text = ", ".join(
-                dict.fromkeys(h["match"] for h in regex_hits))[:80] if regex_hits else "—"
-            rows.append({
-                "Cluster":       cid,
-                "Label":         label,
-                "Item":          name,
-                "Type":          "Method" if item in md.get("rejected_methods",[]) else "Technique",
-                "Regex Match":   matched_text,
-                "Regex Fired":   "✅" if regex_hits else "❌",
-                "LLM Votes":     item["llm_votes"],
-                "Agreement":     item["agreement"],
-                "Avg Pct (%)":   item["pct"],
-                "Evidence":      item.get("evidence", "—"),
-                "Gate Passed":   "❌ REJECTED (single LLM)",
-            })
     return pd.DataFrame(rows) if rows else pd.DataFrame()
 def _per_llm_methodology_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
-    """Per-LLM raw methodology responses side-by-side."""
     rows = []
     for cid in sorted(methodology_data.keys()):
-        md    = methodology_data[cid]
-        label = interps.get(cid, {}).get("label", f"Cluster {cid}")
-        raw   = md.get("llm_raw", {})
         def _fmt(r, key):
-            return " | ".join(
-                f"{i['name']} ({i.get('pct',0)}%)"
-                for i in r.get(key, [])
-            ) or "—"
-        rows.append({
-            "Cluster":             cid,
-            "Label":               label,
-            "Groq Methods":        _fmt(raw.get("groq",{}),    "methodologies"),
-            "Mistral Methods":     _fmt(raw.get("mistral",{}), "methodologies"),
-            "Gemini Methods":      _fmt(raw.get("gemini",{}),  "methodologies"),
-            "Groq Techniques":     _fmt(raw.get("groq",{}),    "techniques"),
-            "Mistral Techniques":  _fmt(raw.get("mistral",{}), "techniques"),
-            "Gemini Techniques":   _fmt(raw.get("gemini",{}),  "techniques"),
-            "Groq Emp/Theo/Mix":   f"{raw.get('groq',{}).get('empirical_pct',0)}/"
-                                   f"{raw.get('groq',{}).get('theoretical_pct',0)}/"
-                                   f"{raw.get('groq',{}).get('mixed_pct',0)}",
-            "Mistral Emp/Theo/Mix":f"{raw.get('mistral',{}).get('empirical_pct',0)}/"
-                                   f"{raw.get('mistral',{}).get('theoretical_pct',0)}/"
-                                   f"{raw.get('mistral',{}).get('mixed_pct',0)}",
-            "Gemini Emp/Theo/Mix": f"{raw.get('gemini',{}).get('empirical_pct',0)}/"
-                                   f"{raw.get('gemini',{}).get('theoretical_pct',0)}/"
-                                   f"{raw.get('gemini',{}).get('mixed_pct',0)}",
         })
     return pd.DataFrame(rows)
 def _regex_hits_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
-    """
-    One row per (cluster, pattern, matched text) so the user can see exactly
-    which regex fired on which word in which paper.
-    """
     rows = []
     for cid in sorted(methodology_data.keys()):
-        md    = methodology_data[cid]
-        label = interps.get(cid, {}).get("label", f"Cluster {cid}")
-        scan  = md.get("regex_scan", {})
-        for category, hits in scan.get("methods", {}).items():
             for h in hits:
-                rows.append({"Cluster": cid, "Label": label,
-                             "Bank": "Methodology", "Pattern Category": category,
-                             "Matched Text": h["match"], "Paper #": h["doc"],
-                             "Char Span": f"{h['span'][0]}–{h['span'][1]}"})
-        for category, hits in scan.get("techniques", {}).items():
             for h in hits:
-                rows.append({"Cluster": cid, "Label": label,
-                             "Bank": "Technique", "Pattern Category": category,
-                             "Matched Text": h["match"], "Paper #": h["doc"],
-                             "Char Span": f"{h['span'][0]}–{h['span'][1]}"})
     return pd.DataFrame(rows) if rows else pd.DataFrame()
@@ -194,82 +166,197 @@ def _methodology_bar_chart(methodology_data: dict, interps: dict) -> go.Figure:
         empirical.append(md.get("empirical_pct", 0))
         theoretical.append(md.get("theoretical_pct", 0))
         mixed.append(md.get("mixed_pct", 0))
     fig = go.Figure()
     fig.add_trace(go.Bar(name="Empirical %",   x=labels_list, y=empirical,   marker_color="#3dba7a"))
     fig.add_trace(go.Bar(name="Theoretical %", x=labels_list, y=theoretical, marker_color="#5b9cf6"))
     fig.add_trace(go.Bar(name="Mixed %",       x=labels_list, y=mixed,       marker_color="#f5a623"))
-    fig.update_layout(
-        barmode="stack", template="plotly_dark", height=420,
         paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
         title="Research Orientation per Cluster — Averaged across Groq + Mistral + Gemini",
         xaxis_title="Cluster", yaxis_title="Percentage (%)",
-        font=dict(size=11), legend=dict(orientation="h", y=1.12),
-        xaxis_tickangle=-35,
-    )
     return fig
 def _regex_pattern_info() -> str:
     m_list = "\n".join(f"- **{k}**: `{v.pattern}`" for k,v in METHODOLOGY_PATTERNS.items())
     t_list = "\n".join(f"- **{k}**: `{v.pattern}`" for k,v in TECHNIQUE_PATTERNS.items())
     return (
-        "### How Methodology Extraction Works\n\n"
-        "**Step 1 — Regex Pre-Scan**  \n"
-        "Two compiled pattern banks (case-insensitive) are run against each representative abstract. "
-        "Every match is recorded with its exact character span, matched text, and paper number. "
-        "This produces ground-truth hints that are injected into the LLM prompt.\n\n"
-        "**Step 2 — 3-LLM Council**  \n"
-        "Groq (llama-3.1-8b), Mistral (mistral-small), and Gemini (gemini-2.5-flash) each receive "
-        "the same prompt: the regex evidence + the full abstracts. Each LLM must confirm or reject "
-        "the regex hits and may add methods/techniques it finds in the text. "
-        "Each LLM also provides an evidence quote (≤15 words) for every item it names.\n\n"
-        "**Step 3 — Consolidation (≥2-LLM gate)**  \n"
-        "A method or technique only survives if at least 2 out of 3 LLMs named it. "
-        "Percentages are averaged across agreeing LLMs. Items named by only one LLM are marked "
-        "REJECTED and shown in the extraction pipeline table.\n\n"
-        "**Step 4 — Orientation Percentages**  \n"
-        "Empirical / Theoretical / Mixed percentages are averaged across all 3 LLMs and shown "
-        "in the stacked bar chart above.\n\n"
-        "---\n\n"
-        "#### Methodology Pattern Bank\n" + m_list +
-        "\n\n#### Technique Pattern Bank\n" + t_list
-    )
-def _refinement_df(refinement_log: list) -> pd.DataFrame:
-    if not refinement_log:
-        return pd.DataFrame(columns=["Cluster","Iteration","Old Label","New Label",
-                                     "Issues","Improvement","Hallucination Detected"])
     return pd.DataFrame([{
-        "Cluster":               r["cluster"],
-        "Iteration":             r["iteration"],
-        "Old Label":             r["old_label"],
-        "New Label":             r["new_label"],
-        "Issues":                "; ".join(r.get("issues",[])),
-        "Improvement":           r["improvement_score"],
-        "Hallucination Detected":r["hallucination_detected"],
-    } for r in refinement_log])
 # ── Pipeline runner ──────────────────────────────────────────────────────────
-def _run(file, gk, mk, gek, n_trials, n_optimize,
          progress=gr.Progress(track_tqdm=True)):
-    if not file: raise gr.Error("Upload a CSV first.")
     gk  = gk.strip()  or os.getenv("GROQ_API_KEY","")
     mk  = mk.strip()  or os.getenv("MISTRAL_API_KEY","")
     gek = gek.strip() or os.getenv("GEMINI_API_KEY","")
     if not all([gk,mk,gek]): raise gr.Error("All 3 API keys required.")
     progress(0.05, desc="📥 Loading CSV…")
-    progress(0.10, desc="🔬 Embedding with SPECTER-2 (this takes a few minutes)…")
-    r = run_pipeline(file.name, gk, mk, gek, int(n_trials), int(n_optimize))
     if r.get("error"): raise gr.Error(r["error"])
     progress(0.85, desc="📊 Building outputs…")
-    td, interps = r["topic_data"], r.get("interpretations", {})
     disc, met   = td["discipline"], td["metrics"]
-    ar          = r.get("agreement_rates", {})
     rl          = r.get("refinement_log", [])
     def _s(ok): return "✅ PASS" if ok else "❌ FAIL"
@@ -286,6 +373,7 @@ def _run(file, gk, mk, gek, n_trials, n_optimize,
         f"**Optimization passes:** {n_optimize} · **Labels refined:** {len(rl)}"
     )
     u2d = np.array(td["umap_2d"])
     sdf = pd.DataFrame({"UMAP-1":u2d[:,0],"UMAP-2":u2d[:,1],
         "Cluster":[str(l) for l in td["labels"]],
@@ -296,6 +384,7 @@ def _run(file, gk, mk, gek, n_trials, n_optimize,
     fig.update_layout(template="plotly_dark", height=500,
         paper_bgcolor="#0d1117", plot_bgcolor="#161b22", font=dict(size=11))
     tl = pd.DataFrame(td["trial_log"])
     tl_cols = [c for c in ["trial","discipline_pass","n_clusters","persistence",
         "dbcv","max_mass_pct","min_size","n_noise"] if c in tl.columns]
@@ -330,31 +419,56 @@ def _run(file, gk, mk, gek, n_trials, n_optimize,
     sp = r.get("sheet_paths",{})
     mdf = pd.DataFrame(r.get("mismatch_table",[]))
-    md_data = r.get("methodology_data", {})
-    top_papers_df       = _top_papers_df(r.get("top_papers", {}))
-    method_summary_df   = _methodology_summary_df(md_data, interps)
-    method_chart        = _methodology_bar_chart(md_data, interps)
-    extraction_df       = _extraction_pipeline_df(md_data, interps)
-    per_llm_df          = _per_llm_methodology_df(md_data, interps)
-    regex_hits_df       = _regex_hits_df(md_data, interps)
-    pattern_info        = _regex_pattern_info()
-    refine_df           = _refinement_df(rl)
     progress(1.0, desc="✅ Done!")
     dl_files = [f for f in [sp.get(1),sp.get(2),sp.get(3),sp.get(4),r.get("json_path")] if f]
-    return (summary, fig, pfig, tl_show, cdf,
-            top_papers_df,
-            method_chart, method_summary_df, extraction_df, per_llm_df,
-            regex_hits_df, pattern_info,
-            refine_df,
-            s1, s2, s3, s4,
-            dl_files if dl_files else None,
-            mdf)
-# ── UI ───────────────────────────────────────────────────────────────────────
 css = ".gradio-container{background:#0d1117!important;color:#c9d1d9!important}" \
       "footer{display:none!important}"
@@ -363,25 +477,39 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="blue", neutral_hue="slate"),
     gr.Markdown("# 📐 SPECTER-2 Topic Analyzer")
     with gr.Row():
         with gr.Column(scale=1):
-            file_in    = gr.File(label="Upload Scopus CSV", file_types=[".csv"])
             preview_out = gr.Markdown("Upload a CSV to see stats.")
             groq_in    = gr.Textbox(label="Groq API Key", type="password",
                             placeholder="or set GROQ_API_KEY env var")
             mistral_in = gr.Textbox(label="Mistral API Key", type="password",
                             placeholder="or set MISTRAL_API_KEY env var")
             gemini_in  = gr.Textbox(label="Gemini API Key", type="password",
                             placeholder="or set GEMINI_API_KEY env var")
-            trials_in  = gr.Slider(10, 100, 50, step=5, label="Optuna Trials")
             optimize_in = gr.Slider(1, 5, 1, step=1,
                             label="🔁 Optimization Passes",
-                            info="Each pass: LLM critic audits labels for hallucinations. "
-                                 "1 = disabled. 2–5 = progressive refinement.")
-            run_btn    = gr.Button("▶ Run Full Pipeline", variant="primary", size="lg")
         with gr.Column(scale=3):
             with gr.Tabs():
                 with gr.Tab("Summary"):
                     summary_out = gr.Markdown()
@@ -399,66 +527,104 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="blue", neutral_hue="slate"),
                 with gr.Tab("🗞 Top 3 Papers"):
                     gr.Markdown("### Top 3 Representative Papers per Cluster\n"
-                                "Ranked by cosine similarity to the cluster centroid "
                                 "in SPECTER-2 embedding space.")
                     top_papers_out = gr.Dataframe(
                         headers=["Cluster","Label","Rank","Title","Abstract Snippet"],
                         wrap=True)
-                with gr.Tab("🔬 Methodology — Summary"):
-                    gr.Markdown("### Consolidated Methodology Results\n"
-                                "Only items agreed by **≥ 2 out of 3 LLMs** (Groq + Mistral + Gemini) "
-                                "appear here. Percentages averaged across agreeing LLMs.")
-                    method_chart_out  = gr.Plot()
                     method_summary_out = gr.Dataframe(wrap=True)
-                with gr.Tab("⚙ Methodology — Extraction Pipeline"):
-                    gr.Markdown("### Full Extraction Trace\n"
-                                "One row per method/technique showing: which regex pattern fired, "
-                                "the exact matched text, how many LLMs agreed, and whether it "
-                                "passed the ≥2-LLM gate.")
                     extraction_out = gr.Dataframe(wrap=True)
-                with gr.Tab("🤖 Methodology — Per-LLM Votes"):
-                    gr.Markdown("### Raw Per-LLM Methodology Responses\n"
-                                "Side-by-side view of what each LLM independently extracted "
-                                "before consolidation.")
                     per_llm_out = gr.Dataframe(wrap=True)
-                with gr.Tab("🔍 Regex Hits"):
-                    gr.Markdown("### Regex Pattern Matches\n"
-                                "Every regex match with its exact character span, matched text, "
-                                "and which paper (1–3) it came from. This is the ground-truth "
-                                "evidence fed to all 3 LLMs.")
-                    regex_hits_out  = gr.Dataframe(wrap=True)
-                    regex_info_out  = gr.Markdown()
                 with gr.Tab("🔁 Refinement Log"):
-                    gr.Markdown("### Optimization Refinement Log\n"
-                                "Changes made by the Groq critic per optimization pass. "
-                                "A label is only changed when improvement_score > 0.15 "
-                                "OR hallucination was detected, AND the new label passes "
-                                "the keyphrase grounding check.")
-                    refine_out = gr.Dataframe(
-                        headers=["Cluster","Iteration","Old Label","New Label",
-                                 "Issues","Improvement","Hallucination Detected"],
-                        wrap=True)
                 with gr.Tab("Sheet 1 — Groq"):    s1_out = gr.Dataframe()
                 with gr.Tab("Sheet 2 — Mistral"): s2_out = gr.Dataframe()
                 with gr.Tab("Sheet 3 — Gemini"):  s3_out = gr.Dataframe()
                 with gr.Tab("Sheet 4 — Consolidated"): s4_out = gr.Dataframe()
-                with gr.Tab("RQ Mismatch"):       mismatch_out = gr.Dataframe()
                 with gr.Tab("Downloads"):
                     dl_out = gr.File(label="All sheet CSVs + topics.json",
                                      file_count="multiple")
-    file_in.change(_preview, inputs=[file_in], outputs=[preview_out])
     run_btn.click(
         _run,
-        inputs=[file_in, groq_in, mistral_in, gemini_in, trials_in, optimize_in],
         outputs=[
             summary_out, scatter_out, pareto_out, trial_out, cluster_out,
             top_papers_out,
             method_chart_out, method_summary_out, extraction_out, per_llm_out,
@@ -466,6 +632,13 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="blue", neutral_hue="slate"),
             refine_out,
             s1_out, s2_out, s3_out, s4_out,
             dl_out, mismatch_out,
         ],
     )

+"""
+app.py — Gradio UI entry point.
+ORIGINAL structure and all tabs preserved.
+NEW: second file upload for methodology CSV, technique sheets 1-4,
+     journal cross-tabulation chart + table, technique optimisation log.
 """
 import os, json
 import pandas as pd, numpy as np
         f"**Usable papers:** {n - max(blanks_t, blanks_a)} / {n}")
def _preview_methodology(file):
    """Return a Markdown preview of the uploaded methodology CSV.

    Args:
        file: The upload to inspect. Either an object exposing the CSV path
            as ``.name`` or a plain path string; a falsy value means
            nothing has been uploaded yet.

    Returns:
        A Markdown string: an upload prompt when ``file`` is falsy, an error
        note when the CSV cannot be read, otherwise a table reporting which
        of the ``title`` / ``doi`` / ``methodology`` columns are present.
    """
    if not file:
        return "Upload methodology CSV (title, doi, methodology) to enable technique analysis."

    # Accept both upload objects (path in ``.name``) and bare path strings.
    path = getattr(file, "name", file)
    try:
        df = pd.read_csv(path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError,
            UnicodeDecodeError, OSError) as exc:
        # A broken upload should produce a readable preview, not a traceback.
        return f"## ❌ Could not read methodology CSV\n\n`{exc}`"

    # Normalise header names: exported sheets often carry stray whitespace
    # or mixed case, which would otherwise be reported as "missing".
    present = {str(col).strip().lower() for col in df.columns}
    has_t = "title" in present
    has_m = "methodology" in present
    has_d = "doi" in present
    n = len(df)
    ok = "✅" if has_t and has_m else "❌"
    return (f"## {ok} Methodology CSV — {n} papers\n\n"
        f"| Column | Present |\n|---|---|\n"
        f"| title | {'✅' if has_t else '❌'} |\n"
        f"| doi | {'✅' if has_d else '⚠ optional'} |\n"
        f"| methodology | {'✅' if has_m else '❌'} |\n\n"
        f"Journals will be auto-detected from DOI + title.")
+# ── Original helper builders ─────────────────────────────────────────────────
 def _top_papers_df(top_papers: dict) -> pd.DataFrame:
     rows = []
     for cid in sorted(top_papers.keys()):
 def _extraction_pipeline_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
     rows = []
     for cid in sorted(methodology_data.keys()):
         md    = methodology_data[cid]
         label = interps.get(cid, {}).get("label", f"Cluster {cid}")
         scan  = md.get("regex_scan", {})
         for item in md.get("methodologies", []) + md.get("techniques", []):
             name      = item["name"]
+            regex_hits= scan.get("methods",{}).get(name,[]) or scan.get("techniques",{}).get(name,[])
+            matched   = ", ".join(dict.fromkeys(h["match"] for h in regex_hits))[:80] if regex_hits else "—"
+            rows.append({"Cluster": cid, "Label": label, "Item": name,
+                "Type":       "Method" if item in md.get("methodologies",[]) else "Technique",
+                "Regex Match":matched, "Regex Fired": "✅" if regex_hits else "❌",
+                "LLM Votes":  item["llm_votes"], "Agreement": item["agreement"],
+                "Avg Pct (%)":item["pct"], "Evidence": item.get("evidence","—"),
+                "Gate Passed":"✅ ACCEPTED"})
+        for item in md.get("rejected_methods",[]) + md.get("rejected_techniques",[]):
+            name      = item["name"]
+            regex_hits= scan.get("methods",{}).get(name,[]) or scan.get("techniques",{}).get(name,[])
+            matched   = ", ".join(dict.fromkeys(h["match"] for h in regex_hits))[:80] if regex_hits else "—"
+            rows.append({"Cluster": cid, "Label": label, "Item": name,
+                "Type":       "Method" if item in md.get("rejected_methods",[]) else "Technique",
+                "Regex Match":matched, "Regex Fired": "✅" if regex_hits else "❌",
+                "LLM Votes":  item["llm_votes"], "Agreement": item["agreement"],
+                "Avg Pct (%)":item["pct"], "Evidence": item.get("evidence","—"),
+                "Gate Passed":"❌ REJECTED (single LLM)"})
     return pd.DataFrame(rows) if rows else pd.DataFrame()
 def _per_llm_methodology_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
     rows = []
     for cid in sorted(methodology_data.keys()):
+        md  = methodology_data[cid]
+        label = interps.get(cid,{}).get("label", f"Cluster {cid}")
+        raw = md.get("llm_raw",{})
         def _fmt(r, key):
+            return " | ".join(f"{i['name']} ({i.get('pct',0)}%)" for i in r.get(key,[])) or "—"
+        rows.append({"Cluster": cid, "Label": label,
+            "Groq Methods":       _fmt(raw.get("groq",{}),    "methodologies"),
+            "Mistral Methods":    _fmt(raw.get("mistral",{}), "methodologies"),
+            "Gemini Methods":     _fmt(raw.get("gemini",{}),  "methodologies"),
+            "Groq Techniques":    _fmt(raw.get("groq",{}),    "techniques"),
+            "Mistral Techniques": _fmt(raw.get("mistral",{}), "techniques"),
+            "Gemini Techniques":  _fmt(raw.get("gemini",{}),  "techniques"),
+            "Groq E/T/M":    f"{raw.get('groq',{}).get('empirical_pct',0)}/"
+                             f"{raw.get('groq',{}).get('theoretical_pct',0)}/"
+                             f"{raw.get('groq',{}).get('mixed_pct',0)}",
+            "Mistral E/T/M": f"{raw.get('mistral',{}).get('empirical_pct',0)}/"
+                             f"{raw.get('mistral',{}).get('theoretical_pct',0)}/"
+                             f"{raw.get('mistral',{}).get('mixed_pct',0)}",
+            "Gemini E/T/M":  f"{raw.get('gemini',{}).get('empirical_pct',0)}/"
+                             f"{raw.get('gemini',{}).get('theoretical_pct',0)}/"
+                             f"{raw.get('gemini',{}).get('mixed_pct',0)}",
         })
     return pd.DataFrame(rows)
 def _regex_hits_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
     rows = []
     for cid in sorted(methodology_data.keys()):
+        md  = methodology_data[cid]
+        label = interps.get(cid,{}).get("label", f"Cluster {cid}")
+        scan  = md.get("regex_scan",{})
+        for category, hits in scan.get("methods",{}).items():
             for h in hits:
+                rows.append({"Cluster": cid, "Label": label, "Bank": "Methodology",
+                    "Pattern Category": category, "Matched Text": h["match"],
+                    "Paper #": h["doc"], "Char Span": f"{h['span'][0]}–{h['span'][1]}"})
+        for category, hits in scan.get("techniques",{}).items():
             for h in hits:
+                rows.append({"Cluster": cid, "Label": label, "Bank": "Technique",
+                    "Pattern Category": category, "Matched Text": h["match"],
+                    "Paper #": h["doc"], "Char Span": f"{h['span'][0]}–{h['span'][1]}"})
     return pd.DataFrame(rows) if rows else pd.DataFrame()
         empirical.append(md.get("empirical_pct", 0))
         theoretical.append(md.get("theoretical_pct", 0))
         mixed.append(md.get("mixed_pct", 0))
     fig = go.Figure()
     fig.add_trace(go.Bar(name="Empirical %",   x=labels_list, y=empirical,   marker_color="#3dba7a"))
     fig.add_trace(go.Bar(name="Theoretical %", x=labels_list, y=theoretical, marker_color="#5b9cf6"))
     fig.add_trace(go.Bar(name="Mixed %",       x=labels_list, y=mixed,       marker_color="#f5a623"))
+    fig.update_layout(barmode="stack", template="plotly_dark", height=420,
         paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
         title="Research Orientation per Cluster — Averaged across Groq + Mistral + Gemini",
         xaxis_title="Cluster", yaxis_title="Percentage (%)",
+        font=dict(size=11), legend=dict(orientation="h", y=1.12), xaxis_tickangle=-35)
     return fig
def _refinement_df(rl: list) -> pd.DataFrame:
    """Turn the label-refinement log into a display table.

    Args:
        rl: List of refinement records. Each record must provide ``cluster``,
            ``iteration``, ``old_label``, ``new_label``,
            ``improvement_score`` and ``hallucination_detected``; ``issues``
            is optional and is joined with ``"; "``.

    Returns:
        One row per record, or an empty frame carrying only the column
        headers when the log is empty or missing.
    """
    if rl:
        records = []
        for entry in rl:
            records.append({
                "Cluster": entry["cluster"],
                "Iteration": entry["iteration"],
                "Old Label": entry["old_label"],
                "New Label": entry["new_label"],
                "Issues": "; ".join(entry.get("issues", [])),
                "Improvement": entry["improvement_score"],
                "Hallucination Detected": entry["hallucination_detected"],
            })
        return pd.DataFrame(records)

    headers = ["Cluster", "Iteration", "Old Label", "New Label",
               "Issues", "Improvement", "Hallucination Detected"]
    return pd.DataFrame(columns=headers)
 def _regex_pattern_info() -> str:
     m_list = "\n".join(f"- **{k}**: `{v.pattern}`" for k,v in METHODOLOGY_PATTERNS.items())
     t_list = "\n".join(f"- **{k}**: `{v.pattern}`" for k,v in TECHNIQUE_PATTERNS.items())
     return (
+        "### How Cluster Methodology Extraction Works\n\n"
+        "**Step 1 — Regex Pre-Scan:** Two compiled pattern banks run against representative "
+        "abstracts. Every match recorded with exact character span, matched text, paper number.\n\n"
+        "**Step 2 — 3-LLM Council:** Groq, Mistral, Gemini each receive regex evidence + abstracts. "
+        "Each LLM confirms/rejects regex hits and adds any missed methods/techniques.\n\n"
+        "**Step 3 — ≥2-LLM Gate:** Only items named by ≥2 LLMs survive. Percentages averaged.\n\n"
+        "**Step 4 — Orientation:** Empirical/Theoretical/Mixed averaged across 3 LLMs.\n\n"
+        "---\n\n#### Methodology Bank\n" + m_list +
+        "\n\n#### Technique Bank\n" + t_list)
+# ── NEW helpers for methodology-CSV pipeline ─────────────────────────────────
def _tech_sheet_df(sheet_rows: list) -> pd.DataFrame:
    """Wrap the rows of one technique sheet in a DataFrame.

    Returns a completely empty frame when ``sheet_rows`` is empty or missing.
    """
    if not sheet_rows:
        return pd.DataFrame()
    return pd.DataFrame(sheet_rows)
def _technique_paper_pct(rows: list) -> dict:
    """Return ``{technique: % of rows mentioning it}`` for one sheet.

    Each row's ``techniques`` value is read as a comma-separated string.
    Names are stripped and title-cased so spelling variants merge, and each
    technique is counted at most once per row so the result really is a share
    of papers (never above 100). Blank entries and the ``"—"`` placeholder
    are ignored.
    """
    total = len(rows) or 1  # avoid dividing by zero for an empty sheet
    counts = {}
    for row in rows:
        raw = row.get("techniques", "") or ""
        # dict.fromkeys de-duplicates within the row, keeping first-seen order.
        names = dict.fromkeys(part.strip().title() for part in raw.split(","))
        for tech in names:
            if tech and tech != "—":
                counts[tech] = counts.get(tech, 0) + 1
    return {tech: round(hits / total * 100) for tech, hits in counts.items()}


def _tech_llm_pct_chart(comp_sheets: dict) -> go.Figure:
    """Build a grouped bar chart of technique frequency per LLM.

    Args:
        comp_sheets: Mapping of sheet number to its list of row dicts.
            Sheets 1-4 are plotted as Groq, Mistral, Gemini and Consolidated
            respectively; a missing sheet is treated as empty.

    Returns:
        A plotly figure with one bar group per technique (sorted by name)
        and one bar per sheet showing the percentage of that sheet's rows
        that mention the technique.
    """
    # (trace name, sheet number, bar colour) in legend order.
    series = (
        ("Groq", 1, "#5b9cf6"),
        ("Mistral", 2, "#f5a623"),
        ("Gemini", 3, "#a855f7"),
        ("Consolidated", 4, "#3dba7a"),
    )
    freqs = [(name, _technique_paper_pct(comp_sheets.get(sheet, [])), color)
             for name, sheet, color in series]
    all_techs = sorted(set().union(*(f for _, f, _ in freqs)))

    fig = go.Figure()
    for name, freq, color in freqs:
        fig.add_trace(go.Bar(name=name, x=all_techs,
                             y=[freq.get(t, 0) for t in all_techs],
                             marker_color=color))
    fig.update_layout(barmode="group", template="plotly_dark", height=480,
        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
        title="Computational Technique Frequency — % of Papers per LLM (Groq / Mistral / Gemini / Consolidated)",
        xaxis_title="Technique", yaxis_title="% of papers",
        font=dict(size=10), legend=dict(orientation="h", y=1.12), xaxis_tickangle=-40)
    return fig
def _journal_crosstab_chart(journal_crosstab: dict) -> go.Figure:
    """Build the grouped bar chart of technique usage per journal.

    Journals sit on the x-axis and each technique is one bar series. Only the
    first 15 techniques are drawn to keep the chart readable. Values come
    from ``journal_crosstab["consolidated"][journal][technique]`` and default
    to 0. When there are no journals or no techniques, a placeholder figure
    titled "No journal data available" is returned instead.
    """
    by_journal = journal_crosstab.get("consolidated", {})
    journal_names = journal_crosstab.get("journals", [])
    tech_names = journal_crosstab.get("techniques", [])

    chart = go.Figure()
    if not (journal_names and tech_names):
        chart.update_layout(template="plotly_dark", title="No journal data available",
                            paper_bgcolor="#0d1117")
        return chart

    # Bar colours, reused cyclically when there are more series than colours.
    palette = ["#5b9cf6", "#3dba7a", "#f5a623", "#e04d4d", "#a855f7", "#06b6d4",
               "#f97316", "#84cc16", "#ec4899", "#14b8a6", "#8b5cf6", "#ef4444"]
    for idx, tech in enumerate(tech_names[:15]):
        shares = [by_journal.get(name, {}).get(tech, 0) for name in journal_names]
        bar = go.Bar(name=tech, x=journal_names, y=shares,
                     marker_color=palette[idx % len(palette)])
        chart.add_trace(bar)

    chart.update_layout(
        barmode="group",
        template="plotly_dark",
        height=500,
        paper_bgcolor="#0d1117",
        plot_bgcolor="#161b22",
        title="Computational Technique Usage — Cross-Tabulation by Journal (%)",
        xaxis_title="Journal",
        yaxis_title="% of papers using technique",
        font=dict(size=10),
        legend=dict(orientation="h", y=1.15),
        xaxis_tickangle=-20,
    )
    return chart
def _journal_crosstab_df(journal_crosstab: dict) -> pd.DataFrame:
    """Render the journal × technique cross-tabulation as a table.

    Each row holds the journal name, its paper count from
    ``journal_paper_counts`` (0 when absent) and one ``"<value>%"`` cell per
    technique taken from the ``consolidated`` mapping (0 when absent).
    """
    cells = journal_crosstab.get("consolidated", {})
    journal_names = journal_crosstab.get("journals", [])
    tech_names = journal_crosstab.get("techniques", [])
    counts = journal_crosstab.get("journal_paper_counts", {})

    records = [
        {
            "Journal": journal,
            "N Papers": counts.get(journal, 0),
            **{tech: f"{cells.get(journal, {}).get(tech, 0)}%" for tech in tech_names},
        }
        for journal in journal_names
    ]
    return pd.DataFrame(records)
def _tech_opt_df(opt_log: list) -> pd.DataFrame:
    """Turn the technique optimisation log into a display table.

    Every log record must contain all of the source keys listed in
    ``field_map``. An empty or missing log yields an empty frame that still
    carries the column headers.
    """
    # (column header, key in the log record), in display order.
    field_map = (
        ("Technique", "technique"),
        ("Refined Name", "refined_name"),
        ("Hallucination", "is_hallucination"),
        ("High Variance", "high_variance"),
        ("Groq %", "pct_groq"),
        ("Mistral %", "pct_mistral"),
        ("Gemini %", "pct_gemini"),
        ("Suggestion", "suggestion"),
        ("Split Into", "split_into"),
        ("Merge With", "merge_with"),
    )
    if opt_log:
        return pd.DataFrame([
            {header: record[key] for header, key in field_map}
            for record in opt_log
        ])
    return pd.DataFrame(columns=[header for header, _ in field_map])
+def _per_llm_freq_df(journal_crosstab: dict) -> pd.DataFrame:
+    """Per-LLM technique frequency across all papers in methodology CSV."""
+    per_llm = journal_crosstab.get("per_llm_freq", {})
+    techniques = sorted(set(t for d in per_llm.values() for t in d.keys()))
+    rows = []
+    for t in techniques:
+        rows.append({
+            "Technique":  t,
+            "Groq %":     per_llm.get("Groq",{}).get(t, 0),
+            "Mistral %":  per_llm.get("Mistral",{}).get(t, 0),
+            "Gemini %":   per_llm.get("Gemini",{}).get(t, 0),
+            "Variance":   round(max(
+                per_llm.get("Groq",{}).get(t,0),
+                per_llm.get("Mistral",{}).get(t,0),
+                per_llm.get("Gemini",{}).get(t,0),
+            ) - min(
+                per_llm.get("Groq",{}).get(t,0),
+                per_llm.get("Mistral",{}).get(t,0),
+                per_llm.get("Gemini",{}).get(t,0),
+            )),
+        })
+    return pd.DataFrame(rows).sort_values("Groq %", ascending=False)
 # ── Pipeline runner ──────────────────────────────────────────────────────────
+def _run(corpus_file, method_file, gk, mk, gek, n_trials, n_optimize,
          progress=gr.Progress(track_tqdm=True)):
     # Handler passed to run_btn.click: validates the inputs, calls
     # run_pipeline(), converts its result dict into tables/figures and returns
     # them as one tuple for the Gradio output components.
     #
     # Parameters (in the order of the `inputs=[...]` list on run_btn.click):
     #   corpus_file  - uploaded corpus file object; its `.name` is passed on.
     #   method_file  - optional uploaded methodology file object (may be falsy).
     #   gk, mk, gek  - Groq / Mistral / Gemini API keys typed into the UI.
     #   n_trials     - slider value, cast to int before use.
     #   n_optimize   - slider value, cast to int before use.
     #   progress     - Gradio progress reporter.
     # Raises gr.Error when no corpus is uploaded, when any of the three API
     # keys is missing, or when run_pipeline() reports an "error" entry.
     #
     # NOTE(review): this view of the function is a diff rendering with elided
     # lines. `summary`, `fig`, `pfig`, `tl_show`, `cdf` and `s1`..`s4` are used
     # in the return tuple but are assigned in lines that are not shown here.
+    if not corpus_file: raise gr.Error("Upload a Scopus corpus CSV first.")
     # A key typed into the textbox wins; an empty box falls back to the
     # corresponding environment variable.
     gk  = gk.strip()  or os.getenv("GROQ_API_KEY","")
     mk  = mk.strip()  or os.getenv("MISTRAL_API_KEY","")
     gek = gek.strip() or os.getenv("GEMINI_API_KEY","")
     if not all([gk,mk,gek]): raise gr.Error("All 3 API keys required.")
     # None tells run_pipeline that no methodology CSV was uploaded.
+    method_path = method_file.name if method_file else None
     # Both progress messages below are emitted before run_pipeline() starts;
     # the next update only happens after it returns.
     progress(0.05, desc="📥 Loading CSV…")
+    progress(0.10, desc="🔬 Embedding corpus with SPECTER-2…")
+    r = run_pipeline(corpus_file.name, gk, mk, gek,
+                     int(n_trials), int(n_optimize), method_path)
     if r.get("error"): raise gr.Error(r["error"])
     progress(0.85, desc="📊 Building outputs…")
     # "topic_data" is required (KeyError if absent); the other result entries
     # are optional and default to empty containers.
+    td, interps = r["topic_data"], r.get("interpretations",{})
     disc, met   = td["discipline"], td["metrics"]
+    ar          = r.get("agreement_rates",{})
     rl          = r.get("refinement_log", [])
     # Small formatter mapping a truthy/falsy check to a PASS/FAIL badge.
     def _s(ok): return "✅ PASS" if ok else "❌ FAIL"
         f"**Optimization passes:** {n_optimize} · **Labels refined:** {len(rl)}"
     )
+    # UMAP scatter
     u2d = np.array(td["umap_2d"])
     sdf = pd.DataFrame({"UMAP-1":u2d[:,0],"UMAP-2":u2d[:,1],
         "Cluster":[str(l) for l in td["labels"]],
     fig.update_layout(template="plotly_dark", height=500,
         paper_bgcolor="#0d1117", plot_bgcolor="#161b22", font=dict(size=11))
+    # Trial log + Pareto
     tl = pd.DataFrame(td["trial_log"])
     # Keep only the expected trial columns that are actually present.
     tl_cols = [c for c in ["trial","discipline_pass","n_clusters","persistence",
         "dbcv","max_mass_pct","min_size","n_noise"] if c in tl.columns]
     sp = r.get("sheet_paths",{})
     mdf = pd.DataFrame(r.get("mismatch_table",[]))
+    md_data  = r.get("methodology_data",{})
+    top_papers_df    = _top_papers_df(r.get("top_papers",{}))
+    method_sum_df    = _methodology_summary_df(md_data, interps)
+    method_chart     = _methodology_bar_chart(md_data, interps)
+    extraction_df    = _extraction_pipeline_df(md_data, interps)
+    per_llm_meth_df  = _per_llm_methodology_df(md_data, interps)
+    regex_hits_df    = _regex_hits_df(md_data, interps)
+    pattern_info     = _regex_pattern_info()
+    refine_df        = _refinement_df(rl)
+    # ── NEW: methodology-CSV outputs ─────────────────────────────────────────
     # Sheets are keyed 1..4; the tabs they feed are labelled Groq, Mistral,
     # Gemini and Consolidated respectively.
+    comp_sheets  = r.get("comp_technique_sheets", {1:[], 2:[], 3:[], 4:[]})
+    jct          = r.get("journal_crosstab", {})
+    tech_opt_log = r.get("technique_opt_log", [])
+    tech_s1 = _tech_sheet_df(comp_sheets.get(1,[]))
+    tech_s2 = _tech_sheet_df(comp_sheets.get(2,[]))
+    tech_s3 = _tech_sheet_df(comp_sheets.get(3,[]))
+    tech_s4 = _tech_sheet_df(comp_sheets.get(4,[]))
+    tech_llm_chart    = _tech_llm_pct_chart(comp_sheets)
+    jct_chart         = _journal_crosstab_chart(jct)
+    jct_df            = _journal_crosstab_df(jct)
+    per_llm_freq_df   = _per_llm_freq_df(jct)
+    tech_opt_df       = _tech_opt_df(tech_opt_log)
     progress(1.0, desc="✅ Done!")
     # Drop missing paths so the download widget only receives real files.
     dl_files = [f for f in [sp.get(1),sp.get(2),sp.get(3),sp.get(4),r.get("json_path")] if f]
     # The tuple order must match the `outputs=[...]` list on run_btn.click.
     # NOTE(review): regex_hits_df and pattern_info are returned here; confirm
     # that the outputs list contains matching components at the same position.
+    return (
+        # ── original outputs (order preserved) ───────────────────────────────
+        summary, fig, pfig, tl_show, cdf,
+        top_papers_df,
+        method_chart, method_sum_df, extraction_df, per_llm_meth_df,
+        regex_hits_df, pattern_info,
+        refine_df,
+        s1, s2, s3, s4,
+        dl_files if dl_files else None,
+        mdf,
+        # ── new outputs ───────────────────────────────────────────────────────
+        tech_llm_chart,
+        tech_s1, tech_s2, tech_s3, tech_s4,
+        per_llm_freq_df,
+        jct_chart,
+        jct_df,
+        tech_opt_df,
+    )
+# ── UI ────────────────────────────────────────────────────────────────────────
 css = ".gradio-container{background:#0d1117!important;color:#c9d1d9!important}" \
       "footer{display:none!important}"
     # NOTE(review): the statements below are indented, so they sit inside an
     # enclosing block (presumably a `with gr.Blocks(...)` using `css`) whose
     # header is not visible in this diff view - confirm against the full file.
     gr.Markdown("# 📐 SPECTER-2 Topic Analyzer")
     with gr.Row():
+        # ── Left sidebar ─────────────────────────────────────────────────────
         with gr.Column(scale=1):
+            gr.Markdown("### 📄 Corpus CSV")
+            file_in    = gr.File(label="Upload Scopus CSV (title + abstract)",
+                                 file_types=[".csv"])
             preview_out = gr.Markdown("Upload a CSV to see stats.")
+            gr.Markdown("### 🔬 Methodology CSV *(optional)*")
+            method_file_in   = gr.File(label="Upload Methodology CSV (title, doi, methodology)",
+                                       file_types=[".csv"])
+            method_preview   = gr.Markdown("Upload methodology CSV to enable technique analysis.")
+            gr.Markdown("### 🔑 API Keys")
             # Keys may be left blank; _run falls back to the environment
             # variables named in the placeholders.
             groq_in    = gr.Textbox(label="Groq API Key", type="password",
                             placeholder="or set GROQ_API_KEY env var")
             mistral_in = gr.Textbox(label="Mistral API Key", type="password",
                             placeholder="or set MISTRAL_API_KEY env var")
             gemini_in  = gr.Textbox(label="Gemini API Key", type="password",
                             placeholder="or set GEMINI_API_KEY env var")
+            gr.Markdown("### ⚙ Parameters")
+            trials_in   = gr.Slider(10, 100, 50, step=5, label="Optuna Trials")
             optimize_in = gr.Slider(1, 5, 1, step=1,
                             label="🔁 Optimization Passes",
+                            info="Pass 1 = no refinement. 2–5 = LLM critic audits topic labels "
+                                 "AND technique labels for hallucinations + improvements.")
+            run_btn = gr.Button("▶ Run Full Pipeline", variant="primary", size="lg")
+        # ── Main panel ────────────────────────────────────────────────────────
         with gr.Column(scale=3):
             # Each tab binds its output component to a name that is later
             # listed in the `outputs=[...]` of run_btn.click.
             with gr.Tabs():
+                # ── original tabs (order / content unchanged) ─────────────────
                 with gr.Tab("Summary"):
                     summary_out = gr.Markdown()
                 with gr.Tab("🗞 Top 3 Papers"):
                     gr.Markdown("### Top 3 Representative Papers per Cluster\n"
+                                "Ranked by cosine similarity to cluster centroid "
                                 "in SPECTER-2 embedding space.")
                     top_papers_out = gr.Dataframe(
                         headers=["Cluster","Label","Rank","Title","Abstract Snippet"],
                         wrap=True)
+                with gr.Tab("🔬 Cluster Methodology"):
+                    gr.Markdown("### Cluster-Level Methodology — 3-LLM Council\n"
+                                "Derived from representative abstracts per cluster. "
+                                "≥2-LLM gate applied.")
+                    method_chart_out   = gr.Plot()
                     method_summary_out = gr.Dataframe(wrap=True)
+                with gr.Tab("⚙ Cluster Extraction Pipeline"):
+                    gr.Markdown("### Full Regex + LLM Extraction Trace (per cluster)")
                     extraction_out = gr.Dataframe(wrap=True)
+                with gr.Tab("🤖 Cluster Per-LLM Votes"):
+                    gr.Markdown("### Raw Per-LLM Methodology Votes (per cluster)")
                     per_llm_out = gr.Dataframe(wrap=True)
+                with gr.Tab("🔍 Cluster Regex Hits"):
+                    gr.Markdown("### Regex Pattern Matches (per cluster)\n"
+                                "Every match with exact character span and paper number.")
                     # NOTE(review): confirm both components below are listed in
                     # the run_btn.click outputs; they are not visible there in
                     # this diff view.
+                    regex_hits_out = gr.Dataframe(wrap=True)
+                    regex_info_out = gr.Markdown()
                 with gr.Tab("🔁 Refinement Log"):
+                    gr.Markdown("### Topic Label Optimization Log\n"
+                                "Changes made by LLM critic per optimization pass.")
+                    refine_out = gr.Dataframe(wrap=True)
                 with gr.Tab("Sheet 1 — Groq"):    s1_out = gr.Dataframe()
                 with gr.Tab("Sheet 2 — Mistral"): s2_out = gr.Dataframe()
                 with gr.Tab("Sheet 3 — Gemini"):  s3_out = gr.Dataframe()
                 with gr.Tab("Sheet 4 — Consolidated"): s4_out = gr.Dataframe()
+                with gr.Tab("RQ Mismatch"):        mismatch_out = gr.Dataframe()
                 with gr.Tab("Downloads"):
                     dl_out = gr.File(label="All sheet CSVs + topics.json",
                                      file_count="multiple")
+                # ── NEW tabs: methodology CSV pipeline ────────────────────────
+                with gr.Tab("💻 Comp. Techniques — LLM % Chart"):
+                    gr.Markdown("### Computational Technique Frequency — Methodology CSV\n"
+                                "For each technique, shows the % of papers it was extracted "
+                                "from by each of the 3 LLMs independently + the consolidated "
+                                "result (≥2-LLM gate). Bars grouped by technique.")
+                    tech_llm_chart_out = gr.Plot()
+                with gr.Tab("💻 Tech Sheet 1 — Groq"):
+                    gr.Markdown("### Groq raw technique extraction — one row per paper")
+                    tech_s1_out = gr.Dataframe(wrap=True)
+                with gr.Tab("💻 Tech Sheet 2 — Mistral"):
+                    gr.Markdown("### Mistral raw technique extraction — one row per paper")
+                    tech_s2_out = gr.Dataframe(wrap=True)
+                with gr.Tab("💻 Tech Sheet 3 — Gemini"):
+                    gr.Markdown("### Gemini raw technique extraction — one row per paper")
+                    tech_s3_out = gr.Dataframe(wrap=True)
+                with gr.Tab("💻 Tech Sheet 4 — Consolidated"):
+                    gr.Markdown("### Consolidated techniques — ≥2-LLM agreement, one row per paper")
+                    tech_s4_out = gr.Dataframe(wrap=True)
+                with gr.Tab("📊 Tech Frequency by LLM"):
+                    gr.Markdown("### Per-LLM Technique Frequency Table\n"
+                                "% of all papers where each LLM extracted each technique. "
+                                "High variance = LLMs disagree → optimization flag.")
+                    per_llm_freq_out = gr.Dataframe(wrap=True)
+                with gr.Tab("🗂 Journal Cross-Tabulation"):
+                    gr.Markdown("### Technique × Journal Cross-Tabulation\n"
+                                "Rows = journals auto-detected from DOI/title. "
+                                "Columns = consolidated techniques. "
+                                "Values = % of papers in that journal using the technique.\n\n"
+                                "**Journals detected:** MISQ, JAIS, ISR, JMIS, PAJAIS, "
+                                "ECIS, ICIS, Other.")
+                    jct_chart_out = gr.Plot()
+                    jct_df_out    = gr.Dataframe(wrap=True)
+                with gr.Tab("🔧 Technique Optimization"):
+                    gr.Markdown("### Technique Label Improvement Suggestions\n"
+                                "Groq critic flags: hallucination, high inter-LLM variance "
+                                "(>15% gap), split/merge recommendations.\n"
+                                "Only runs when Optimization Passes ≥ 2.")
+                    tech_opt_out = gr.Dataframe(wrap=True)
+    # ── Wire callbacks ────────────────────────────────────────────────────────
     # Each upload widget refreshes its own preview Markdown when its file changes.
+    file_in.change(_preview,            inputs=[file_in],        outputs=[preview_out])
+    method_file_in.change(_preview_methodology, inputs=[method_file_in], outputs=[method_preview])
     # `inputs` follow _run's positional parameters; `outputs` must line up
     # one-to-one, in order, with the tuple that _run returns.
     run_btn.click(
         _run,
+        inputs=[file_in, method_file_in, groq_in, mistral_in, gemini_in,
+                trials_in, optimize_in],
         outputs=[
+            # original
             summary_out, scatter_out, pareto_out, trial_out, cluster_out,
             top_papers_out,
             method_chart_out, method_summary_out, extraction_out, per_llm_out,
             # NOTE(review): _run returns regex_hits_df and pattern_info between
             # per_llm_meth_df and refine_df (28 values in total), but only 26
             # components are visible in this list - regex_hits_out and
             # regex_info_out do not appear here. Confirm whether that line is
             # merely elided by the diff view or is actually missing.
             refine_out,
             s1_out, s2_out, s3_out, s4_out,
             dl_out, mismatch_out,
+            # new
+            tech_llm_chart_out,
+            tech_s1_out, tech_s2_out, tech_s3_out, tech_s4_out,
+            per_llm_freq_out,
+            jct_chart_out,
+            jct_df_out,
+            tech_opt_out,
         ],
     )