BHAVIKBANKER committed on
Commit
a627b52
·
verified ·
1 Parent(s): 64049b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +377 -204
app.py CHANGED
@@ -1,7 +1,8 @@
1
- """app.py β€” Gradio UI entry point.
2
- Tabs: Summary, UMAP, Pareto, Trial Log, Clusters, Top 3 Papers,
3
- Methodology (3-LLM council + regex pipeline), Refinement Log,
4
- Sheet 1-4, RQ Mismatch, Downloads.
 
5
  """
6
  import os, json
7
  import pandas as pd, numpy as np
@@ -28,7 +29,24 @@ def _preview(file):
28
  f"**Usable papers:** {n - max(blanks_t, blanks_a)} / {n}")
29
 
30
 
31
- # ── Helper builders ──────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def _top_papers_df(top_papers: dict) -> pd.DataFrame:
33
  rows = []
34
  for cid in sorted(top_papers.keys()):
@@ -65,124 +83,78 @@ def _methodology_summary_df(methodology_data: dict, interps: dict) -> pd.DataFra
65
 
66
 
67
  def _extraction_pipeline_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
68
- """
69
- One row per (cluster, method/technique) showing the full extraction trace:
70
- which regex pattern fired, what text it matched, which LLMs confirmed it,
71
- and whether it passed the β‰₯2-LLM gate.
72
- """
73
  rows = []
74
  for cid in sorted(methodology_data.keys()):
75
  md = methodology_data[cid]
76
  label = interps.get(cid, {}).get("label", f"Cluster {cid}")
77
  scan = md.get("regex_scan", {})
78
-
79
- # Accepted items
80
  for item in md.get("methodologies", []) + md.get("techniques", []):
81
- name = item["name"]
82
- # Find regex hits for this category name
83
- regex_hits = scan.get("methods", {}).get(name, []) or \
84
- scan.get("techniques", {}).get(name, [])
85
- matched_text = ", ".join(
86
- dict.fromkeys(h["match"] for h in regex_hits))[:80] if regex_hits else "β€”"
87
- rows.append({
88
- "Cluster": cid,
89
- "Label": label,
90
- "Item": name,
91
- "Type": "Method" if item in md.get("methodologies",[]) else "Technique",
92
- "Regex Match": matched_text,
93
- "Regex Fired": "βœ…" if regex_hits else "❌",
94
- "LLM Votes": item["llm_votes"],
95
- "Agreement": item["agreement"],
96
- "Avg Pct (%)": item["pct"],
97
- "Evidence": item.get("evidence", "β€”"),
98
- "Gate Passed": "βœ… ACCEPTED",
99
- })
100
-
101
- # Rejected items (single LLM only)
102
- for item in md.get("rejected_methods", []) + md.get("rejected_techniques", []):
103
  name = item["name"]
104
- regex_hits = scan.get("methods", {}).get(name, []) or \
105
- scan.get("techniques", {}).get(name, [])
106
- matched_text = ", ".join(
107
- dict.fromkeys(h["match"] for h in regex_hits))[:80] if regex_hits else "β€”"
108
- rows.append({
109
- "Cluster": cid,
110
- "Label": label,
111
- "Item": name,
112
- "Type": "Method" if item in md.get("rejected_methods",[]) else "Technique",
113
- "Regex Match": matched_text,
114
- "Regex Fired": "βœ…" if regex_hits else "❌",
115
- "LLM Votes": item["llm_votes"],
116
- "Agreement": item["agreement"],
117
- "Avg Pct (%)": item["pct"],
118
- "Evidence": item.get("evidence", "β€”"),
119
- "Gate Passed": "❌ REJECTED (single LLM)",
120
- })
121
-
122
  return pd.DataFrame(rows) if rows else pd.DataFrame()
123
 
124
 
125
  def _per_llm_methodology_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
126
- """Per-LLM raw methodology responses side-by-side."""
127
  rows = []
128
  for cid in sorted(methodology_data.keys()):
129
- md = methodology_data[cid]
130
- label = interps.get(cid, {}).get("label", f"Cluster {cid}")
131
- raw = md.get("llm_raw", {})
132
-
133
  def _fmt(r, key):
134
- return " | ".join(
135
- f"{i['name']} ({i.get('pct',0)}%)"
136
- for i in r.get(key, [])
137
- ) or "β€”"
138
-
139
- rows.append({
140
- "Cluster": cid,
141
- "Label": label,
142
- "Groq Methods": _fmt(raw.get("groq",{}), "methodologies"),
143
- "Mistral Methods": _fmt(raw.get("mistral",{}), "methodologies"),
144
- "Gemini Methods": _fmt(raw.get("gemini",{}), "methodologies"),
145
- "Groq Techniques": _fmt(raw.get("groq",{}), "techniques"),
146
- "Mistral Techniques": _fmt(raw.get("mistral",{}), "techniques"),
147
- "Gemini Techniques": _fmt(raw.get("gemini",{}), "techniques"),
148
- "Groq Emp/Theo/Mix": f"{raw.get('groq',{}).get('empirical_pct',0)}/"
149
- f"{raw.get('groq',{}).get('theoretical_pct',0)}/"
150
- f"{raw.get('groq',{}).get('mixed_pct',0)}",
151
- "Mistral Emp/Theo/Mix":f"{raw.get('mistral',{}).get('empirical_pct',0)}/"
152
- f"{raw.get('mistral',{}).get('theoretical_pct',0)}/"
153
- f"{raw.get('mistral',{}).get('mixed_pct',0)}",
154
- "Gemini Emp/Theo/Mix": f"{raw.get('gemini',{}).get('empirical_pct',0)}/"
155
- f"{raw.get('gemini',{}).get('theoretical_pct',0)}/"
156
- f"{raw.get('gemini',{}).get('mixed_pct',0)}",
157
  })
158
  return pd.DataFrame(rows)
159
 
160
 
161
  def _regex_hits_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
162
- """
163
- One row per (cluster, pattern, matched text) so the user can see exactly
164
- which regex fired on which word in which paper.
165
- """
166
  rows = []
167
  for cid in sorted(methodology_data.keys()):
168
- md = methodology_data[cid]
169
- label = interps.get(cid, {}).get("label", f"Cluster {cid}")
170
- scan = md.get("regex_scan", {})
171
-
172
- for category, hits in scan.get("methods", {}).items():
173
  for h in hits:
174
- rows.append({"Cluster": cid, "Label": label,
175
- "Bank": "Methodology", "Pattern Category": category,
176
- "Matched Text": h["match"], "Paper #": h["doc"],
177
- "Char Span": f"{h['span'][0]}–{h['span'][1]}"})
178
-
179
- for category, hits in scan.get("techniques", {}).items():
180
  for h in hits:
181
- rows.append({"Cluster": cid, "Label": label,
182
- "Bank": "Technique", "Pattern Category": category,
183
- "Matched Text": h["match"], "Paper #": h["doc"],
184
- "Char Span": f"{h['span'][0]}–{h['span'][1]}"})
185
-
186
  return pd.DataFrame(rows) if rows else pd.DataFrame()
187
 
188
 
@@ -194,82 +166,197 @@ def _methodology_bar_chart(methodology_data: dict, interps: dict) -> go.Figure:
194
  empirical.append(md.get("empirical_pct", 0))
195
  theoretical.append(md.get("theoretical_pct", 0))
196
  mixed.append(md.get("mixed_pct", 0))
197
-
198
  fig = go.Figure()
199
  fig.add_trace(go.Bar(name="Empirical %", x=labels_list, y=empirical, marker_color="#3dba7a"))
200
  fig.add_trace(go.Bar(name="Theoretical %", x=labels_list, y=theoretical, marker_color="#5b9cf6"))
201
  fig.add_trace(go.Bar(name="Mixed %", x=labels_list, y=mixed, marker_color="#f5a623"))
202
- fig.update_layout(
203
- barmode="stack", template="plotly_dark", height=420,
204
  paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
205
  title="Research Orientation per Cluster β€” Averaged across Groq + Mistral + Gemini",
206
  xaxis_title="Cluster", yaxis_title="Percentage (%)",
207
- font=dict(size=11), legend=dict(orientation="h", y=1.12),
208
- xaxis_tickangle=-35,
209
- )
210
  return fig
211
 
212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  def _regex_pattern_info() -> str:
214
  m_list = "\n".join(f"- **{k}**: `{v.pattern}`" for k,v in METHODOLOGY_PATTERNS.items())
215
  t_list = "\n".join(f"- **{k}**: `{v.pattern}`" for k,v in TECHNIQUE_PATTERNS.items())
216
  return (
217
- "### How Methodology Extraction Works\n\n"
218
- "**Step 1 β€” Regex Pre-Scan** \n"
219
- "Two compiled pattern banks (case-insensitive) are run against each representative abstract. "
220
- "Every match is recorded with its exact character span, matched text, and paper number. "
221
- "This produces ground-truth hints that are injected into the LLM prompt.\n\n"
222
- "**Step 2 β€” 3-LLM Council** \n"
223
- "Groq (llama-3.1-8b), Mistral (mistral-small), and Gemini (gemini-2.5-flash) each receive "
224
- "the same prompt: the regex evidence + the full abstracts. Each LLM must confirm or reject "
225
- "the regex hits and may add methods/techniques it finds in the text. "
226
- "Each LLM also provides an evidence quote (≀15 words) for every item it names.\n\n"
227
- "**Step 3 β€” Consolidation (β‰₯2-LLM gate)** \n"
228
- "A method or technique only survives if at least 2 out of 3 LLMs named it. "
229
- "Percentages are averaged across agreeing LLMs. Items named by only one LLM are marked "
230
- "REJECTED and shown in the extraction pipeline table.\n\n"
231
- "**Step 4 β€” Orientation Percentages** \n"
232
- "Empirical / Theoretical / Mixed percentages are averaged across all 3 LLMs and shown "
233
- "in the stacked bar chart above.\n\n"
234
- "---\n\n"
235
- "#### Methodology Pattern Bank\n" + m_list +
236
- "\n\n#### Technique Pattern Bank\n" + t_list
237
- )
238
 
239
 
240
- def _refinement_df(refinement_log: list) -> pd.DataFrame:
241
- if not refinement_log:
242
- return pd.DataFrame(columns=["Cluster","Iteration","Old Label","New Label",
243
- "Issues","Improvement","Hallucination Detected"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  return pd.DataFrame([{
245
- "Cluster": r["cluster"],
246
- "Iteration": r["iteration"],
247
- "Old Label": r["old_label"],
248
- "New Label": r["new_label"],
249
- "Issues": "; ".join(r.get("issues",[])),
250
- "Improvement": r["improvement_score"],
251
- "Hallucination Detected":r["hallucination_detected"],
252
- } for r in refinement_log])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
 
255
  # ── Pipeline runner ──────────────────────────────────────────────────────────
256
- def _run(file, gk, mk, gek, n_trials, n_optimize,
257
  progress=gr.Progress(track_tqdm=True)):
258
- if not file: raise gr.Error("Upload a CSV first.")
259
  gk = gk.strip() or os.getenv("GROQ_API_KEY","")
260
  mk = mk.strip() or os.getenv("MISTRAL_API_KEY","")
261
  gek = gek.strip() or os.getenv("GEMINI_API_KEY","")
262
  if not all([gk,mk,gek]): raise gr.Error("All 3 API keys required.")
263
 
 
 
264
  progress(0.05, desc="πŸ“₯ Loading CSV…")
265
- progress(0.10, desc="πŸ”¬ Embedding with SPECTER-2 (this takes a few minutes)…")
266
- r = run_pipeline(file.name, gk, mk, gek, int(n_trials), int(n_optimize))
 
267
  if r.get("error"): raise gr.Error(r["error"])
268
-
269
  progress(0.85, desc="πŸ“Š Building outputs…")
270
- td, interps = r["topic_data"], r.get("interpretations", {})
 
271
  disc, met = td["discipline"], td["metrics"]
272
- ar = r.get("agreement_rates", {})
273
  rl = r.get("refinement_log", [])
274
 
275
  def _s(ok): return "βœ… PASS" if ok else "❌ FAIL"
@@ -286,6 +373,7 @@ def _run(file, gk, mk, gek, n_trials, n_optimize,
286
  f"**Optimization passes:** {n_optimize} Β· **Labels refined:** {len(rl)}"
287
  )
288
 
 
289
  u2d = np.array(td["umap_2d"])
290
  sdf = pd.DataFrame({"UMAP-1":u2d[:,0],"UMAP-2":u2d[:,1],
291
  "Cluster":[str(l) for l in td["labels"]],
@@ -296,6 +384,7 @@ def _run(file, gk, mk, gek, n_trials, n_optimize,
296
  fig.update_layout(template="plotly_dark", height=500,
297
  paper_bgcolor="#0d1117", plot_bgcolor="#161b22", font=dict(size=11))
298
 
 
299
  tl = pd.DataFrame(td["trial_log"])
300
  tl_cols = [c for c in ["trial","discipline_pass","n_clusters","persistence",
301
  "dbcv","max_mass_pct","min_size","n_noise"] if c in tl.columns]
@@ -330,31 +419,56 @@ def _run(file, gk, mk, gek, n_trials, n_optimize,
330
  sp = r.get("sheet_paths",{})
331
  mdf = pd.DataFrame(r.get("mismatch_table",[]))
332
 
333
- md_data = r.get("methodology_data", {})
334
-
335
- top_papers_df = _top_papers_df(r.get("top_papers", {}))
336
- method_summary_df = _methodology_summary_df(md_data, interps)
337
- method_chart = _methodology_bar_chart(md_data, interps)
338
- extraction_df = _extraction_pipeline_df(md_data, interps)
339
- per_llm_df = _per_llm_methodology_df(md_data, interps)
340
- regex_hits_df = _regex_hits_df(md_data, interps)
341
- pattern_info = _regex_pattern_info()
342
- refine_df = _refinement_df(rl)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
  progress(1.0, desc="βœ… Done!")
345
  dl_files = [f for f in [sp.get(1),sp.get(2),sp.get(3),sp.get(4),r.get("json_path")] if f]
346
 
347
- return (summary, fig, pfig, tl_show, cdf,
348
- top_papers_df,
349
- method_chart, method_summary_df, extraction_df, per_llm_df,
350
- regex_hits_df, pattern_info,
351
- refine_df,
352
- s1, s2, s3, s4,
353
- dl_files if dl_files else None,
354
- mdf)
 
 
 
 
 
 
 
 
 
 
355
 
356
 
357
- # ── UI ───────────────────────────────────────────────────────────────────────
358
  css = ".gradio-container{background:#0d1117!important;color:#c9d1d9!important}" \
359
  "footer{display:none!important}"
360
 
@@ -363,25 +477,39 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="blue", neutral_hue="slate"),
363
  gr.Markdown("# πŸ“ SPECTER-2 Topic Analyzer")
364
 
365
  with gr.Row():
 
366
  with gr.Column(scale=1):
367
- file_in = gr.File(label="Upload Scopus CSV", file_types=[".csv"])
 
 
368
  preview_out = gr.Markdown("Upload a CSV to see stats.")
 
 
 
 
 
 
 
369
  groq_in = gr.Textbox(label="Groq API Key", type="password",
370
  placeholder="or set GROQ_API_KEY env var")
371
  mistral_in = gr.Textbox(label="Mistral API Key", type="password",
372
  placeholder="or set MISTRAL_API_KEY env var")
373
  gemini_in = gr.Textbox(label="Gemini API Key", type="password",
374
  placeholder="or set GEMINI_API_KEY env var")
375
- trials_in = gr.Slider(10, 100, 50, step=5, label="Optuna Trials")
 
 
376
  optimize_in = gr.Slider(1, 5, 1, step=1,
377
  label="πŸ” Optimization Passes",
378
- info="Each pass: LLM critic audits labels for hallucinations. "
379
- "1 = disabled. 2–5 = progressive refinement.")
380
- run_btn = gr.Button("β–Ά Run Full Pipeline", variant="primary", size="lg")
381
 
 
382
  with gr.Column(scale=3):
383
  with gr.Tabs():
384
 
 
385
  with gr.Tab("Summary"):
386
  summary_out = gr.Markdown()
387
 
@@ -399,66 +527,104 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="blue", neutral_hue="slate"),
399
 
400
  with gr.Tab("πŸ—ž Top 3 Papers"):
401
  gr.Markdown("### Top 3 Representative Papers per Cluster\n"
402
- "Ranked by cosine similarity to the cluster centroid "
403
  "in SPECTER-2 embedding space.")
404
  top_papers_out = gr.Dataframe(
405
  headers=["Cluster","Label","Rank","Title","Abstract Snippet"],
406
  wrap=True)
407
 
408
- with gr.Tab("πŸ”¬ Methodology β€” Summary"):
409
- gr.Markdown("### Consolidated Methodology Results\n"
410
- "Only items agreed by **β‰₯ 2 out of 3 LLMs** (Groq + Mistral + Gemini) "
411
- "appear here. Percentages averaged across agreeing LLMs.")
412
- method_chart_out = gr.Plot()
413
  method_summary_out = gr.Dataframe(wrap=True)
414
 
415
- with gr.Tab("βš™ Methodology β€” Extraction Pipeline"):
416
- gr.Markdown("### Full Extraction Trace\n"
417
- "One row per method/technique showing: which regex pattern fired, "
418
- "the exact matched text, how many LLMs agreed, and whether it "
419
- "passed the β‰₯2-LLM gate.")
420
  extraction_out = gr.Dataframe(wrap=True)
421
 
422
- with gr.Tab("πŸ€– Methodology β€” Per-LLM Votes"):
423
- gr.Markdown("### Raw Per-LLM Methodology Responses\n"
424
- "Side-by-side view of what each LLM independently extracted "
425
- "before consolidation.")
426
  per_llm_out = gr.Dataframe(wrap=True)
427
 
428
- with gr.Tab("πŸ” Regex Hits"):
429
- gr.Markdown("### Regex Pattern Matches\n"
430
- "Every regex match with its exact character span, matched text, "
431
- "and which paper (1–3) it came from. This is the ground-truth "
432
- "evidence fed to all 3 LLMs.")
433
- regex_hits_out = gr.Dataframe(wrap=True)
434
- regex_info_out = gr.Markdown()
435
 
436
  with gr.Tab("πŸ” Refinement Log"):
437
- gr.Markdown("### Optimization Refinement Log\n"
438
- "Changes made by the Groq critic per optimization pass. "
439
- "A label is only changed when improvement_score > 0.15 "
440
- "OR hallucination was detected, AND the new label passes "
441
- "the keyphrase grounding check.")
442
- refine_out = gr.Dataframe(
443
- headers=["Cluster","Iteration","Old Label","New Label",
444
- "Issues","Improvement","Hallucination Detected"],
445
- wrap=True)
446
 
447
  with gr.Tab("Sheet 1 β€” Groq"): s1_out = gr.Dataframe()
448
  with gr.Tab("Sheet 2 β€” Mistral"): s2_out = gr.Dataframe()
449
  with gr.Tab("Sheet 3 β€” Gemini"): s3_out = gr.Dataframe()
450
  with gr.Tab("Sheet 4 β€” Consolidated"): s4_out = gr.Dataframe()
451
- with gr.Tab("RQ Mismatch"): mismatch_out = gr.Dataframe()
452
  with gr.Tab("Downloads"):
453
  dl_out = gr.File(label="All sheet CSVs + topics.json",
454
  file_count="multiple")
455
 
456
- file_in.change(_preview, inputs=[file_in], outputs=[preview_out])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
 
458
  run_btn.click(
459
  _run,
460
- inputs=[file_in, groq_in, mistral_in, gemini_in, trials_in, optimize_in],
 
461
  outputs=[
 
462
  summary_out, scatter_out, pareto_out, trial_out, cluster_out,
463
  top_papers_out,
464
  method_chart_out, method_summary_out, extraction_out, per_llm_out,
@@ -466,6 +632,13 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="blue", neutral_hue="slate"),
466
  refine_out,
467
  s1_out, s2_out, s3_out, s4_out,
468
  dl_out, mismatch_out,
 
 
 
 
 
 
 
469
  ],
470
  )
471
 
 
1
+ """
2
+ app.py — Gradio UI entry point.
3
+ ORIGINAL structure and all tabs preserved.
4
+ NEW: second file upload for methodology CSV, technique sheets 1-4,
5
+ journal cross-tabulation chart + table, technique optimisation log.
6
  """
7
  import os, json
8
  import pandas as pd, numpy as np
 
29
  f"**Usable papers:** {n - max(blanks_t, blanks_a)} / {n}")
30
 
31
 
32
def _preview_methodology(file):
    """Render a Markdown summary of the uploaded methodology CSV.

    Args:
        file: The upload to inspect. May be a path (``str`` or
            ``os.PathLike``), an object exposing the path as ``.name``,
            or a falsy value when nothing has been uploaded yet.

    Returns:
        Markdown text. Without an upload this is a prompt asking for the
        file; on a read/parse failure it is an error notice; otherwise it is
        a table stating which of the ``title`` / ``doi`` / ``methodology``
        columns are present, together with the row count.
    """
    if not file:
        return "Upload methodology CSV (title, doi, methodology) to enable technique analysis."

    # A pathlib.Path also has a ``.name`` (its basename), so paths must be
    # recognised before falling back to the attribute lookup.
    path = file if isinstance(file, (str, os.PathLike)) else file.name
    try:
        df = pd.read_csv(path)
    except (OSError, UnicodeDecodeError,
            pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # A preview should report an unreadable file, not crash the handler.
        return f"## ❌ Could not read methodology CSV\n\n`{exc}`"

    # Headers are matched case-insensitively and ignoring padding, so
    # " Title " counts as "title".
    columns = {str(col).strip().lower() for col in df.columns}
    has_t = "title" in columns
    has_m = "methodology" in columns
    has_d = "doi" in columns
    n = len(df)

    # Only title and methodology are mandatory; a missing doi is a warning.
    ok = "✅" if has_t and has_m else "❌"
    return (f"## {ok} Methodology CSV — {n} papers\n\n"
            f"| Column | Present |\n|---|---|\n"
            f"| title | {'✅' if has_t else '❌'} |\n"
            f"| doi | {'✅' if has_d else '⚠ optional'} |\n"
            f"| methodology | {'✅' if has_m else '❌'} |\n\n"
            f"Journals will be auto-detected from DOI + title.")
47
+
48
+
49
+ # ── Original helper builders ─────────────────────────────────────────────────
50
  def _top_papers_df(top_papers: dict) -> pd.DataFrame:
51
  rows = []
52
  for cid in sorted(top_papers.keys()):
 
83
 
84
 
85
  def _extraction_pipeline_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
 
 
 
 
 
86
  rows = []
87
  for cid in sorted(methodology_data.keys()):
88
  md = methodology_data[cid]
89
  label = interps.get(cid, {}).get("label", f"Cluster {cid}")
90
  scan = md.get("regex_scan", {})
 
 
91
  for item in md.get("methodologies", []) + md.get("techniques", []):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  name = item["name"]
93
+ regex_hits= scan.get("methods",{}).get(name,[]) or scan.get("techniques",{}).get(name,[])
94
+ matched = ", ".join(dict.fromkeys(h["match"] for h in regex_hits))[:80] if regex_hits else "β€”"
95
+ rows.append({"Cluster": cid, "Label": label, "Item": name,
96
+ "Type": "Method" if item in md.get("methodologies",[]) else "Technique",
97
+ "Regex Match":matched, "Regex Fired": "βœ…" if regex_hits else "❌",
98
+ "LLM Votes": item["llm_votes"], "Agreement": item["agreement"],
99
+ "Avg Pct (%)":item["pct"], "Evidence": item.get("evidence","β€”"),
100
+ "Gate Passed":"βœ… ACCEPTED"})
101
+ for item in md.get("rejected_methods",[]) + md.get("rejected_techniques",[]):
102
+ name = item["name"]
103
+ regex_hits= scan.get("methods",{}).get(name,[]) or scan.get("techniques",{}).get(name,[])
104
+ matched = ", ".join(dict.fromkeys(h["match"] for h in regex_hits))[:80] if regex_hits else "β€”"
105
+ rows.append({"Cluster": cid, "Label": label, "Item": name,
106
+ "Type": "Method" if item in md.get("rejected_methods",[]) else "Technique",
107
+ "Regex Match":matched, "Regex Fired": "βœ…" if regex_hits else "❌",
108
+ "LLM Votes": item["llm_votes"], "Agreement": item["agreement"],
109
+ "Avg Pct (%)":item["pct"], "Evidence": item.get("evidence","β€”"),
110
+ "Gate Passed":"❌ REJECTED (single LLM)"})
111
  return pd.DataFrame(rows) if rows else pd.DataFrame()
112
 
113
 
114
  def _per_llm_methodology_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
 
115
  rows = []
116
  for cid in sorted(methodology_data.keys()):
117
+ md = methodology_data[cid]
118
+ label = interps.get(cid,{}).get("label", f"Cluster {cid}")
119
+ raw = md.get("llm_raw",{})
 
120
  def _fmt(r, key):
121
+ return " | ".join(f"{i['name']} ({i.get('pct',0)}%)" for i in r.get(key,[])) or "β€”"
122
+ rows.append({"Cluster": cid, "Label": label,
123
+ "Groq Methods": _fmt(raw.get("groq",{}), "methodologies"),
124
+ "Mistral Methods": _fmt(raw.get("mistral",{}), "methodologies"),
125
+ "Gemini Methods": _fmt(raw.get("gemini",{}), "methodologies"),
126
+ "Groq Techniques": _fmt(raw.get("groq",{}), "techniques"),
127
+ "Mistral Techniques": _fmt(raw.get("mistral",{}), "techniques"),
128
+ "Gemini Techniques": _fmt(raw.get("gemini",{}), "techniques"),
129
+ "Groq E/T/M": f"{raw.get('groq',{}).get('empirical_pct',0)}/"
130
+ f"{raw.get('groq',{}).get('theoretical_pct',0)}/"
131
+ f"{raw.get('groq',{}).get('mixed_pct',0)}",
132
+ "Mistral E/T/M": f"{raw.get('mistral',{}).get('empirical_pct',0)}/"
133
+ f"{raw.get('mistral',{}).get('theoretical_pct',0)}/"
134
+ f"{raw.get('mistral',{}).get('mixed_pct',0)}",
135
+ "Gemini E/T/M": f"{raw.get('gemini',{}).get('empirical_pct',0)}/"
136
+ f"{raw.get('gemini',{}).get('theoretical_pct',0)}/"
137
+ f"{raw.get('gemini',{}).get('mixed_pct',0)}",
 
 
 
 
 
 
138
  })
139
  return pd.DataFrame(rows)
140
 
141
 
142
  def _regex_hits_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
 
 
 
 
143
  rows = []
144
  for cid in sorted(methodology_data.keys()):
145
+ md = methodology_data[cid]
146
+ label = interps.get(cid,{}).get("label", f"Cluster {cid}")
147
+ scan = md.get("regex_scan",{})
148
+ for category, hits in scan.get("methods",{}).items():
 
149
  for h in hits:
150
+ rows.append({"Cluster": cid, "Label": label, "Bank": "Methodology",
151
+ "Pattern Category": category, "Matched Text": h["match"],
152
+ "Paper #": h["doc"], "Char Span": f"{h['span'][0]}–{h['span'][1]}"})
153
+ for category, hits in scan.get("techniques",{}).items():
 
 
154
  for h in hits:
155
+ rows.append({"Cluster": cid, "Label": label, "Bank": "Technique",
156
+ "Pattern Category": category, "Matched Text": h["match"],
157
+ "Paper #": h["doc"], "Char Span": f"{h['span'][0]}–{h['span'][1]}"})
 
 
158
  return pd.DataFrame(rows) if rows else pd.DataFrame()
159
 
160
 
 
166
  empirical.append(md.get("empirical_pct", 0))
167
  theoretical.append(md.get("theoretical_pct", 0))
168
  mixed.append(md.get("mixed_pct", 0))
 
169
  fig = go.Figure()
170
  fig.add_trace(go.Bar(name="Empirical %", x=labels_list, y=empirical, marker_color="#3dba7a"))
171
  fig.add_trace(go.Bar(name="Theoretical %", x=labels_list, y=theoretical, marker_color="#5b9cf6"))
172
  fig.add_trace(go.Bar(name="Mixed %", x=labels_list, y=mixed, marker_color="#f5a623"))
173
+ fig.update_layout(barmode="stack", template="plotly_dark", height=420,
 
174
  paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
175
  title="Research Orientation per Cluster β€” Averaged across Groq + Mistral + Gemini",
176
  xaxis_title="Cluster", yaxis_title="Percentage (%)",
177
+ font=dict(size=11), legend=dict(orientation="h", y=1.12), xaxis_tickangle=-35)
 
 
178
  return fig
179
 
180
 
181
def _refinement_df(rl: list) -> pd.DataFrame:
    """Turn the label-refinement log into a display table.

    Args:
        rl: List of refinement records. Each record supplies ``cluster``,
            ``iteration``, ``old_label``, ``new_label``,
            ``improvement_score`` and ``hallucination_detected``, plus an
            optional ``issues`` entry (a list of items or a single string).

    Returns:
        A DataFrame with one row per record. With an empty or missing log the
        frame has the same columns and no rows.
    """
    columns = ["Cluster", "Iteration", "Old Label", "New Label",
               "Issues", "Improvement", "Hallucination Detected"]
    if not rl:
        return pd.DataFrame(columns=columns)

    def _issues_text(issues):
        # A bare string is kept whole: joining it would insert "; " between
        # its characters. A missing/None value becomes an empty cell.
        if not issues:
            return ""
        if isinstance(issues, str):
            return issues
        return "; ".join(str(issue) for issue in issues)

    return pd.DataFrame([{
        "Cluster": r["cluster"],
        "Iteration": r["iteration"],
        "Old Label": r["old_label"],
        "New Label": r["new_label"],
        "Issues": _issues_text(r.get("issues")),
        "Improvement": r["improvement_score"],
        "Hallucination Detected": r["hallucination_detected"],
    } for r in rl], columns=columns)
192
+
193
+
194
  def _regex_pattern_info() -> str:
195
  m_list = "\n".join(f"- **{k}**: `{v.pattern}`" for k,v in METHODOLOGY_PATTERNS.items())
196
  t_list = "\n".join(f"- **{k}**: `{v.pattern}`" for k,v in TECHNIQUE_PATTERNS.items())
197
  return (
198
+ "### How Cluster Methodology Extraction Works\n\n"
199
+ "**Step 1 β€” Regex Pre-Scan:** Two compiled pattern banks run against representative "
200
+ "abstracts. Every match recorded with exact character span, matched text, paper number.\n\n"
201
+ "**Step 2 β€” 3-LLM Council:** Groq, Mistral, Gemini each receive regex evidence + abstracts. "
202
+ "Each LLM confirms/rejects regex hits and adds any missed methods/techniques.\n\n"
203
+ "**Step 3 β€” β‰₯2-LLM Gate:** Only items named by β‰₯2 LLMs survive. Percentages averaged.\n\n"
204
+ "**Step 4 β€” Orientation:** Empirical/Theoretical/Mixed averaged across 3 LLMs.\n\n"
205
+ "---\n\n#### Methodology Bank\n" + m_list +
206
+ "\n\n#### Technique Bank\n" + t_list)
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
 
209
+ # ── NEW helpers for methodology-CSV pipeline ─────────────────────────────────
210
def _tech_sheet_df(sheet_rows: list) -> pd.DataFrame:
    """Convert one technique sheet (a list of row dicts) into a DataFrame.

    An empty or missing sheet yields an empty, column-less DataFrame.
    """
    if not sheet_rows:
        return pd.DataFrame()
    return pd.DataFrame(sheet_rows)
212
+
213
+
214
def _tech_llm_pct_chart(comp_sheets: dict) -> go.Figure:
    """Plot, per technique, the share of papers in which each sheet lists it.

    Args:
        comp_sheets: Mapping of sheet number to a list of row dicts. Sheets
            1, 2, 3 and 4 are drawn as the "Groq", "Mistral", "Gemini" and
            "Consolidated" series. Each row's ``techniques`` value is read as
            comma-separated text.
            NOTE(review): assumes ``techniques`` is a string or empty; confirm
            against the code that produces the sheets.

    Returns:
        A grouped bar chart with techniques on the x-axis and the rounded
        percentage of that sheet's rows on the y-axis.
    """
    def _freq(rows):
        # A set per row counts each paper at most once per technique, so a
        # technique repeated within one cell cannot push the share past 100%.
        counts = {}
        for row in rows:
            cell = row.get("techniques", "") or ""
            # Splitting on "," and stripping handles "a, b" and "a,b" alike.
            names = {part.strip().title() for part in cell.split(",")}
            names -= {"", "—"}  # blanks and the "no technique" placeholder
            for name in names:
                counts[name] = counts.get(name, 0) + 1
        n = len(rows) or 1  # avoids dividing by zero for an empty sheet
        return {name: round(count / n * 100) for name, count in counts.items()}

    series = (
        ("Groq", 1, "#5b9cf6"),
        ("Mistral", 2, "#f5a623"),
        ("Gemini", 3, "#a855f7"),
        ("Consolidated", 4, "#3dba7a"),
    )
    freqs = {sheet: _freq(comp_sheets.get(sheet) or []) for _, sheet, _ in series}
    all_techs = sorted(set().union(*freqs.values()))

    fig = go.Figure()
    for name, sheet, color in series:
        fig.add_trace(go.Bar(name=name, x=all_techs,
                             y=[freqs[sheet].get(t, 0) for t in all_techs],
                             marker_color=color))
    fig.update_layout(barmode="group", template="plotly_dark", height=480,
                      paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
                      title="Computational Technique Frequency — % of Papers per LLM (Groq / Mistral / Gemini / Consolidated)",
                      xaxis_title="Technique", yaxis_title="% of papers",
                      font=dict(size=10), legend=dict(orientation="h", y=1.12), xaxis_tickangle=-40)
    return fig
248
+
249
+
250
def _journal_crosstab_chart(journal_crosstab: dict, max_techniques: int = 15) -> go.Figure:
    """Plot technique usage per journal as a grouped bar chart.

    Args:
        journal_crosstab: Dict with ``journals`` (x-axis order),
            ``techniques`` (one bar series each) and ``consolidated``
            (journal -> technique -> percentage; missing cells plot as 0).
        max_techniques: Maximum number of technique series drawn, taken from
            the front of ``techniques``. Defaults to 15 for readability.

    Returns:
        The grouped bar chart, or an empty figure titled
        "No journal data available" when there are no journals or techniques.
    """
    ct = journal_crosstab.get("consolidated", {})
    journals = journal_crosstab.get("journals", [])
    techniques = journal_crosstab.get("techniques", [])

    fig = go.Figure()
    if not journals or not techniques:
        fig.update_layout(template="plotly_dark", title="No journal data available",
                          paper_bgcolor="#0d1117", plot_bgcolor="#161b22")
        return fig

    # The palette has fewer entries than the default cap, so colours repeat
    # via the modulo below once it is exhausted.
    colors = ["#5b9cf6", "#3dba7a", "#f5a623", "#e04d4d", "#a855f7", "#06b6d4",
              "#f97316", "#84cc16", "#ec4899", "#14b8a6", "#8b5cf6", "#ef4444"]

    for i, tech in enumerate(techniques[:max_techniques]):
        pcts = [ct.get(j, {}).get(tech, 0) for j in journals]
        fig.add_trace(go.Bar(name=tech, x=journals, y=pcts,
                             marker_color=colors[i % len(colors)]))

    fig.update_layout(barmode="group", template="plotly_dark", height=500,
                      paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
                      title="Computational Technique Usage — Cross-Tabulation by Journal (%)",
                      xaxis_title="Journal", yaxis_title="% of papers using technique",
                      font=dict(size=10), legend=dict(orientation="h", y=1.15), xaxis_tickangle=-20)
    return fig
280
+
281
+
282
def _journal_crosstab_df(journal_crosstab: dict) -> pd.DataFrame:
    """Build the journal x technique cross-tabulation table.

    Args:
        journal_crosstab: Dict with ``journals`` (row order), ``techniques``
            (column order), ``consolidated`` (journal -> technique ->
            percentage) and ``journal_paper_counts`` (journal -> count).

    Returns:
        One row per journal with "Journal", "N Papers" and one "<pct>%"
        column per technique; missing cells read "0%" and missing counts 0.
        The columns are present even when there are no journals.
    """
    ct = journal_crosstab.get("consolidated", {})
    journals = journal_crosstab.get("journals", [])
    techniques = journal_crosstab.get("techniques", [])
    paper_counts = journal_crosstab.get("journal_paper_counts", {})

    rows = []
    for journal in journals:
        cells = ct.get(journal, {})
        row = {"Journal": journal, "N Papers": paper_counts.get(journal, 0)}
        row.update({tech: f"{cells.get(tech, 0)}%" for tech in techniques})
        rows.append(row)

    # Explicit columns keep the header when there are no journals;
    # dict.fromkeys drops repeated technique names while keeping order.
    columns = list(dict.fromkeys(["Journal", "N Papers", *techniques]))
    return pd.DataFrame(rows, columns=columns)
294
+
295
+
296
def _tech_opt_df(opt_log: list) -> pd.DataFrame:
    """Turn the technique-optimisation log into a display table.

    Args:
        opt_log: List of records, each supplying ``technique``,
            ``refined_name``, ``is_hallucination``, ``high_variance``,
            ``pct_groq``, ``pct_mistral``, ``pct_gemini``, ``suggestion``,
            ``split_into`` and ``merge_with``.

    Returns:
        A DataFrame with one row per record. With an empty or missing log the
        frame has the same columns and no rows.
    """
    columns = ["Technique", "Refined Name", "Hallucination",
               "High Variance", "Groq %", "Mistral %", "Gemini %",
               "Suggestion", "Split Into", "Merge With"]
    if not opt_log:
        return pd.DataFrame(columns=columns)

    def _as_text(value):
        # Collections are flattened to comma-separated text so the cell shows
        # readable names; any other value is passed through unchanged.
        if isinstance(value, (list, tuple, set)):
            return ", ".join(str(v) for v in value)
        return value

    return pd.DataFrame([{
        "Technique": r["technique"],
        "Refined Name": r["refined_name"],
        "Hallucination": r["is_hallucination"],
        "High Variance": r["high_variance"],
        "Groq %": r["pct_groq"],
        "Mistral %": r["pct_mistral"],
        "Gemini %": r["pct_gemini"],
        "Suggestion": r["suggestion"],
        "Split Into": _as_text(r["split_into"]),
        "Merge With": _as_text(r["merge_with"]),
    } for r in opt_log], columns=columns)
313
+
314
+
315
def _per_llm_freq_df(journal_crosstab: dict) -> pd.DataFrame:
    """Build the per-LLM technique frequency table.

    Args:
        journal_crosstab: Dict whose ``per_llm_freq`` entry maps an LLM name
            to a ``technique -> percentage`` dict. The "Groq", "Mistral" and
            "Gemini" entries fill the three percentage columns.

    Returns:
        One row per technique found under any LLM, sorted by "Groq %"
        descending (ties stay in alphabetical order). The "Variance" column
        holds the rounded spread (max minus min) of the three percentages.
        With no frequency data the frame has the same columns and no rows.
    """
    columns = ["Technique", "Groq %", "Mistral %", "Gemini %", "Variance"]
    per_llm = journal_crosstab.get("per_llm_freq") or {}
    techniques = sorted({t for freqs in per_llm.values() for t in freqs or {}})

    groq = per_llm.get("Groq") or {}
    mistral = per_llm.get("Mistral") or {}
    gemini = per_llm.get("Gemini") or {}

    rows = []
    for tech in techniques:
        pcts = (groq.get(tech, 0), mistral.get(tech, 0), gemini.get(tech, 0))
        rows.append({
            "Technique": tech,
            "Groq %": pcts[0],
            "Mistral %": pcts[1],
            "Gemini %": pcts[2],
            "Variance": round(max(pcts) - min(pcts)),
        })

    # Explicit columns keep "Groq %" present when there are no rows, so the
    # sort below no longer raises KeyError on an empty result. A stable sort
    # makes the order of ties deterministic.
    table = pd.DataFrame(rows, columns=columns)
    return table.sort_values("Groq %", ascending=False, kind="mergesort")
337
 
338
 
339
  # ── Pipeline runner ──────────────────────────────────────────────────────────
340
+ def _run(corpus_file, method_file, gk, mk, gek, n_trials, n_optimize,
341
  progress=gr.Progress(track_tqdm=True)):
342
+ if not corpus_file: raise gr.Error("Upload a Scopus corpus CSV first.")
343
  gk = gk.strip() or os.getenv("GROQ_API_KEY","")
344
  mk = mk.strip() or os.getenv("MISTRAL_API_KEY","")
345
  gek = gek.strip() or os.getenv("GEMINI_API_KEY","")
346
  if not all([gk,mk,gek]): raise gr.Error("All 3 API keys required.")
347
 
348
+ method_path = method_file.name if method_file else None
349
+
350
  progress(0.05, desc="πŸ“₯ Loading CSV…")
351
+ progress(0.10, desc="πŸ”¬ Embedding corpus with SPECTER-2…")
352
+ r = run_pipeline(corpus_file.name, gk, mk, gek,
353
+ int(n_trials), int(n_optimize), method_path)
354
  if r.get("error"): raise gr.Error(r["error"])
 
355
  progress(0.85, desc="πŸ“Š Building outputs…")
356
+
357
+ td, interps = r["topic_data"], r.get("interpretations",{})
358
  disc, met = td["discipline"], td["metrics"]
359
+ ar = r.get("agreement_rates",{})
360
  rl = r.get("refinement_log", [])
361
 
362
  def _s(ok): return "βœ… PASS" if ok else "❌ FAIL"
 
373
  f"**Optimization passes:** {n_optimize} Β· **Labels refined:** {len(rl)}"
374
  )
375
 
376
+ # UMAP scatter
377
  u2d = np.array(td["umap_2d"])
378
  sdf = pd.DataFrame({"UMAP-1":u2d[:,0],"UMAP-2":u2d[:,1],
379
  "Cluster":[str(l) for l in td["labels"]],
 
384
  fig.update_layout(template="plotly_dark", height=500,
385
  paper_bgcolor="#0d1117", plot_bgcolor="#161b22", font=dict(size=11))
386
 
387
+ # Trial log + Pareto
388
  tl = pd.DataFrame(td["trial_log"])
389
  tl_cols = [c for c in ["trial","discipline_pass","n_clusters","persistence",
390
  "dbcv","max_mass_pct","min_size","n_noise"] if c in tl.columns]
 
419
  sp = r.get("sheet_paths",{})
420
  mdf = pd.DataFrame(r.get("mismatch_table",[]))
421
 
422
+ md_data = r.get("methodology_data",{})
423
+ top_papers_df = _top_papers_df(r.get("top_papers",{}))
424
+ method_sum_df = _methodology_summary_df(md_data, interps)
425
+ method_chart = _methodology_bar_chart(md_data, interps)
426
+ extraction_df = _extraction_pipeline_df(md_data, interps)
427
+ per_llm_meth_df = _per_llm_methodology_df(md_data, interps)
428
+ regex_hits_df = _regex_hits_df(md_data, interps)
429
+ pattern_info = _regex_pattern_info()
430
+ refine_df = _refinement_df(rl)
431
+
432
+ # ── NEW: methodology-CSV outputs ─────────────────────────────────────────
433
+ comp_sheets = r.get("comp_technique_sheets", {1:[], 2:[], 3:[], 4:[]})
434
+ jct = r.get("journal_crosstab", {})
435
+ tech_opt_log = r.get("technique_opt_log", [])
436
+
437
+ tech_s1 = _tech_sheet_df(comp_sheets.get(1,[]))
438
+ tech_s2 = _tech_sheet_df(comp_sheets.get(2,[]))
439
+ tech_s3 = _tech_sheet_df(comp_sheets.get(3,[]))
440
+ tech_s4 = _tech_sheet_df(comp_sheets.get(4,[]))
441
+
442
+ tech_llm_chart = _tech_llm_pct_chart(comp_sheets)
443
+ jct_chart = _journal_crosstab_chart(jct)
444
+ jct_df = _journal_crosstab_df(jct)
445
+ per_llm_freq_df = _per_llm_freq_df(jct)
446
+ tech_opt_df = _tech_opt_df(tech_opt_log)
447
 
448
  progress(1.0, desc="βœ… Done!")
449
  dl_files = [f for f in [sp.get(1),sp.get(2),sp.get(3),sp.get(4),r.get("json_path")] if f]
450
 
451
+ return (
452
+ # ── original outputs (order preserved) ───────────────────────────────
453
+ summary, fig, pfig, tl_show, cdf,
454
+ top_papers_df,
455
+ method_chart, method_sum_df, extraction_df, per_llm_meth_df,
456
+ regex_hits_df, pattern_info,
457
+ refine_df,
458
+ s1, s2, s3, s4,
459
+ dl_files if dl_files else None,
460
+ mdf,
461
+ # ── new outputs ───────────────────────────────────────────────────────
462
+ tech_llm_chart,
463
+ tech_s1, tech_s2, tech_s3, tech_s4,
464
+ per_llm_freq_df,
465
+ jct_chart,
466
+ jct_df,
467
+ tech_opt_df,
468
+ )
469
 
470
 
471
+ # ── UI ────────────────────────────────────────────────────────────────────────
472
  css = ".gradio-container{background:#0d1117!important;color:#c9d1d9!important}" \
473
  "footer{display:none!important}"
474
 
 
477
  gr.Markdown("# πŸ“ SPECTER-2 Topic Analyzer")
478
 
479
  with gr.Row():
480
+ # ── Left sidebar ─────────────────────────────────────────────────────
481
  with gr.Column(scale=1):
482
+ gr.Markdown("### 📄 Corpus CSV")
483
+ file_in = gr.File(label="Upload Scopus CSV (title + abstract)",
484
+ file_types=[".csv"])
485
  preview_out = gr.Markdown("Upload a CSV to see stats.")
486
+
487
+ gr.Markdown("### 🔬 Methodology CSV *(optional)*")
488
+ method_file_in = gr.File(label="Upload Methodology CSV (title, doi, methodology)",
489
+ file_types=[".csv"])
490
+ method_preview = gr.Markdown("Upload methodology CSV to enable technique analysis.")
491
+
492
+ gr.Markdown("### 🔑 API Keys")
493
  groq_in = gr.Textbox(label="Groq API Key", type="password",
494
  placeholder="or set GROQ_API_KEY env var")
495
  mistral_in = gr.Textbox(label="Mistral API Key", type="password",
496
  placeholder="or set MISTRAL_API_KEY env var")
497
  gemini_in = gr.Textbox(label="Gemini API Key", type="password",
498
  placeholder="or set GEMINI_API_KEY env var")
499
+
500
+ gr.Markdown("### ⚙ Parameters")
501
+ trials_in = gr.Slider(10, 100, 50, step=5, label="Optuna Trials")
502
  optimize_in = gr.Slider(1, 5, 1, step=1,
503
  label="πŸ” Optimization Passes",
504
+ info="Pass 1 = no refinement. 2–5 = LLM critic audits topic labels "
505
+ "AND technique labels for hallucinations + improvements.")
506
+ run_btn = gr.Button("▶ Run Full Pipeline", variant="primary", size="lg")
507
 
508
+ # ── Main panel ────────────────────────────────────────────────────────
509
  with gr.Column(scale=3):
510
  with gr.Tabs():
511
 
512
+ # ── original tabs (order / content unchanged) ─────────────────
513
  with gr.Tab("Summary"):
514
  summary_out = gr.Markdown()
515
 
 
527
 
528
  with gr.Tab("πŸ—ž Top 3 Papers"):
529
  gr.Markdown("### Top 3 Representative Papers per Cluster\n"
530
+ "Ranked by cosine similarity to cluster centroid "
531
  "in SPECTER-2 embedding space.")
532
  top_papers_out = gr.Dataframe(
533
  headers=["Cluster","Label","Rank","Title","Abstract Snippet"],
534
  wrap=True)
535
 
536
+ with gr.Tab("🔬 Cluster Methodology"):
537
+ gr.Markdown("### Cluster-Level Methodology — 3-LLM Council\n"
538
+ "Derived from representative abstracts per cluster. "
539
+ "≥2-LLM gate applied.")
540
+ method_chart_out = gr.Plot()
541
  method_summary_out = gr.Dataframe(wrap=True)
542
 
543
+ with gr.Tab("⚙ Cluster Extraction Pipeline"):
544
+ gr.Markdown("### Full Regex + LLM Extraction Trace (per cluster)")
 
 
 
545
  extraction_out = gr.Dataframe(wrap=True)
546
 
547
+ with gr.Tab("🤖 Cluster Per-LLM Votes"):
548
+ gr.Markdown("### Raw Per-LLM Methodology Votes (per cluster)")
 
 
549
  per_llm_out = gr.Dataframe(wrap=True)
550
 
551
+ with gr.Tab("πŸ” Cluster Regex Hits"):
552
+ gr.Markdown("### Regex Pattern Matches (per cluster)\n"
553
+ "Every match with exact character span and paper number.")
554
+ regex_hits_out = gr.Dataframe(wrap=True)
555
+ regex_info_out = gr.Markdown()
 
 
556
 
557
  with gr.Tab("πŸ” Refinement Log"):
558
+ gr.Markdown("### Topic Label Optimization Log\n"
559
+ "Changes made by LLM critic per optimization pass.")
560
+ refine_out = gr.Dataframe(wrap=True)
 
 
 
 
 
 
561
 
562
  with gr.Tab("Sheet 1 β€” Groq"): s1_out = gr.Dataframe()
563
  with gr.Tab("Sheet 2 β€” Mistral"): s2_out = gr.Dataframe()
564
  with gr.Tab("Sheet 3 β€” Gemini"): s3_out = gr.Dataframe()
565
  with gr.Tab("Sheet 4 β€” Consolidated"): s4_out = gr.Dataframe()
566
+ with gr.Tab("RQ Mismatch"): mismatch_out = gr.Dataframe()
567
  with gr.Tab("Downloads"):
568
  dl_out = gr.File(label="All sheet CSVs + topics.json",
569
  file_count="multiple")
570
 
571
+ # ── NEW tabs: methodology CSV pipeline ────────────────────────
572
+ with gr.Tab("💻 Comp. Techniques — LLM % Chart"):
573
+ gr.Markdown("### Computational Technique Frequency — Methodology CSV\n"
574
+ "For each technique, shows the % of papers it was extracted "
575
+ "from by each of the 3 LLMs independently + the consolidated "
576
+ "result (≥2-LLM gate). Bars grouped by technique.")
577
+ tech_llm_chart_out = gr.Plot()
578
+
579
+ with gr.Tab("💻 Tech Sheet 1 — Groq"):
580
+ gr.Markdown("### Groq raw technique extraction — one row per paper")
581
+ tech_s1_out = gr.Dataframe(wrap=True)
582
+
583
+ with gr.Tab("💻 Tech Sheet 2 — Mistral"):
584
+ gr.Markdown("### Mistral raw technique extraction — one row per paper")
585
+ tech_s2_out = gr.Dataframe(wrap=True)
586
+
587
+ with gr.Tab("💻 Tech Sheet 3 — Gemini"):
588
+ gr.Markdown("### Gemini raw technique extraction — one row per paper")
589
+ tech_s3_out = gr.Dataframe(wrap=True)
590
+
591
+ with gr.Tab("💻 Tech Sheet 4 — Consolidated"):
592
+ gr.Markdown("### Consolidated techniques — ≥2-LLM agreement, one row per paper")
593
+ tech_s4_out = gr.Dataframe(wrap=True)
594
+
595
+ with gr.Tab("📊 Tech Frequency by LLM"):
596
+ gr.Markdown("### Per-LLM Technique Frequency Table\n"
597
+ "% of all papers where each LLM extracted each technique. "
598
+ "High variance = LLMs disagree → optimization flag.")
599
+ per_llm_freq_out = gr.Dataframe(wrap=True)
600
+
601
+ with gr.Tab("🗂 Journal Cross-Tabulation"):
602
+ gr.Markdown("### Technique × Journal Cross-Tabulation\n"
603
+ "Rows = journals auto-detected from DOI/title. "
604
+ "Columns = consolidated techniques. "
605
+ "Values = % of papers in that journal using the technique.\n\n"
606
+ "**Journals detected:** MISQ, JAIS, ISR, JMIS, PAJAIS, "
607
+ "ECIS, ICIS, Other.")
608
+ jct_chart_out = gr.Plot()
609
+ jct_df_out = gr.Dataframe(wrap=True)
610
+
611
+ with gr.Tab("🔧 Technique Optimization"):
612
+ gr.Markdown("### Technique Label Improvement Suggestions\n"
613
+ "Groq critic flags: hallucination, high inter-LLM variance "
614
+ "(>15% gap), split/merge recommendations.\n"
615
+ "Only runs when Optimization Passes ≥ 2.")
616
+ tech_opt_out = gr.Dataframe(wrap=True)
617
+
618
+ # ── Wire callbacks ────────────────────────────────────────────────────────
619
+ file_in.change(_preview, inputs=[file_in], outputs=[preview_out])
620
+ method_file_in.change(_preview_methodology, inputs=[method_file_in], outputs=[method_preview])
621
 
622
  run_btn.click(
623
  _run,
624
+ inputs=[file_in, method_file_in, groq_in, mistral_in, gemini_in,
625
+ trials_in, optimize_in],
626
  outputs=[
627
+ # original
628
  summary_out, scatter_out, pareto_out, trial_out, cluster_out,
629
  top_papers_out,
630
  method_chart_out, method_summary_out, extraction_out, per_llm_out,
 
632
  refine_out,
633
  s1_out, s2_out, s3_out, s4_out,
634
  dl_out, mismatch_out,
635
+ # new
636
+ tech_llm_chart_out,
637
+ tech_s1_out, tech_s2_out, tech_s3_out, tech_s4_out,
638
+ per_llm_freq_out,
639
+ jct_chart_out,
640
+ jct_df_out,
641
+ tech_opt_out,
642
  ],
643
  )
644