"""analytics.py — Load sample results and build Plotly figures for the Analytics tab."""

import json
import os
from collections import Counter, defaultdict

import plotly.graph_objects as go
from plotly.subplots import make_subplots  # noqa: F401  (unused in this module; kept deliberately)

# ── Data loading ──────────────────────────────────────────────────────────────

_DIR = os.path.dirname(__file__)

# Conference display name -> JSONL results file located next to this module.
DATASETS = {
    "ICLR 2025": "iclr2025_v2_results.jsonl",
    "ICML 2025": "icml2025_v3_results.jsonl",
    "NeurIPS 2025": "neurips2025_v3_results.jsonl",
}

# Marker/bar colour per review label.
LABEL_COLORS = {
    "System 1": "#ef4444",
    "Mixed": "#f59e0b",
    "System 2": "#22c55e",
}

# Bar/violin colour per conference.
CONF_COLORS = {
    "ICLR 2025": "#6366f1",
    "ICML 2025": "#f59e0b",
    "NeurIPS 2025": "#22c55e",
}

# Colour used for any label or conference missing from the palettes above.
_FALLBACK_COLOR = "#888"


def _load_results(fname: str) -> list:
    """Read a JSONL file located next to this module.

    Args:
        fname: File name relative to this module's directory.

    Returns:
        A list with one dict per parseable, non-blank line. Lines that are
        not valid JSON, or whose top-level value is not a JSON object, are
        skipped. Returns an empty list when the file does not exist.
    """
    path = os.path.join(_DIR, fname)
    records = []
    try:
        # Explicit encoding so the result does not depend on the platform
        # default; the context manager guarantees the handle is closed.
        with open(path, encoding="utf-8") as fh:
            for raw in fh:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    record = json.loads(raw)
                except ValueError:
                    # json.JSONDecodeError is a ValueError subclass.
                    # Best-effort loading: one bad line must not abort the file.
                    continue
                # load_all() calls .get() on every record, so keep objects only.
                if isinstance(record, dict):
                    records.append(record)
    except FileNotFoundError:
        return []
    return records


def load_all() -> dict:
    """Returns {conf: {"papers": [...], "reviews": [...], "metas": [...]}}

    "reviews" holds every entry of a paper's "review_ratings" that has a
    truthy "label"; "metas" holds each paper's "metareview_rating" under the
    same condition. Both are shallow copies extended with "_decision" (the
    paper's "decision", "" when absent) and "_conf" (the conference name).
    """
    data = {}
    for conf, fname in DATASETS.items():
        papers = _load_results(fname)
        reviews = []
        metas = []
        for paper in papers:
            tags = {"_decision": paper.get("decision", ""), "_conf": conf}
            # `or []` also covers an explicit null in the JSON.
            for rating in paper.get("review_ratings") or []:
                if rating.get("label"):
                    reviews.append({**rating, **tags})
            meta = paper.get("metareview_rating")
            if meta and meta.get("label"):
                metas.append({**meta, **tags})
        data[conf] = {"papers": papers, "reviews": reviews, "metas": metas}
    return data


def _rqs_values(reviews: list) -> list:
    """Return the truthy "overall_reasoning_quality_score" values as floats."""
    return [
        float(r["overall_reasoning_quality_score"])
        for r in reviews
        if r.get("overall_reasoning_quality_score")
    ]


# ── Figure builders ───────────────────────────────────────────────────────────

def fig_label_distribution(data: dict) -> go.Figure:
    """Grouped bar: label distribution per conference.

    Args:
        data: Mapping shaped like the return value of load_all().

    Returns:
        A figure with one bar trace per label. Each bar is the percentage of
        that conference's reviews carrying the label (0 when the conference
        has no reviews), rounded to one decimal place.
    """
    labels_order = ["System 1", "Mixed", "System 2"]
    confs = list(data.keys())

    # Tally labels once per conference instead of rescanning per label.
    label_counts = {
        conf: Counter(r.get("label") for r in data[conf]["reviews"])
        for conf in confs
    }
    totals = {conf: len(data[conf]["reviews"]) for conf in confs}

    fig = go.Figure()
    for lbl in labels_order:
        y_vals = [
            round(label_counts[conf][lbl] / totals[conf] * 100, 1) if totals[conf] else 0
            for conf in confs
        ]
        fig.add_trace(go.Bar(
            name=lbl,
            x=confs,
            y=y_vals,
            marker_color=LABEL_COLORS.get(lbl, _FALLBACK_COLOR),
            text=[f"{v}%" for v in y_vals],
            textposition="outside",
        ))
    fig.update_layout(
        title="Review Label Distribution by Conference",
        barmode="group",
        yaxis=dict(title="% of reviews", range=[0, 75]),
        legend=dict(orientation="h", y=-0.2),
        height=420,
        margin=dict(t=50, b=80),
    )
    return fig


def fig_rqs_by_decision(data: dict) -> go.Figure:
    """Grouped bar: mean RQS per decision tier per conference.

    Args:
        data: Mapping shaped like the return value of load_all().

    Returns:
        A figure with one bar trace per conference over the tiers Oral,
        Spotlight and Poster. Reviews whose decision is not in the tier map,
        or whose RQS is missing/falsy, are ignored. A tier without any
        reviews gets no bar and no label.
    """
    decision_map = {
        "Accept (Oral)": "Oral",
        "Accept (oral)": "Oral",
        "Accept (Spotlight)": "Spotlight",
        "Accept (spotlight)": "Spotlight",
        "Accept (spotlight poster)": "Spotlight",
        "Accept (Poster)": "Poster",
        "Accept (poster)": "Poster",
    }
    tiers = ["Oral", "Spotlight", "Poster"]
    confs = list(data.keys())

    fig = go.Figure()
    for conf in confs:
        by_tier = defaultdict(list)
        for r in data[conf]["reviews"]:
            tier = decision_map.get(r.get("_decision"))
            rqs = r.get("overall_reasoning_quality_score")
            if tier and rqs:
                by_tier[tier].append(float(rqs))
        y_vals = [
            round(sum(by_tier[t]) / len(by_tier[t]), 2) if by_tier[t] else None
            for t in tiers
        ]
        counts = [len(by_tier[t]) for t in tiers]
        fig.add_trace(go.Bar(
            name=conf,
            x=tiers,
            y=y_vals,
            marker_color=CONF_COLORS.get(conf, _FALLBACK_COLOR),
            # `is not None` so only empty tiers lose their label;
            # <br> is Plotly's line-break markup for text.
            text=[
                f"{v:.2f}<br>(n={c})" if v is not None else ""
                for v, c in zip(y_vals, counts)
            ],
            textposition="outside",
        ))
    fig.update_layout(
        title="Mean Reasoning Quality Score by Decision Tier",
        barmode="group",
        yaxis=dict(title="RQS (1–5)", range=[0, 4]),
        legend=dict(orientation="h", y=-0.2),
        height=420,
        margin=dict(t=50, b=80),
    )
    return fig


def fig_s1_s2_scatter(data: dict) -> go.Figure:
    """Scatter: S1 score vs S2 score, colored by label, one trace per conf.

    Args:
        data: Mapping shaped like the return value of load_all().

    Returns:
        A figure with one marker trace per (conference, label) pair that has
        at least one review with truthy "system1_score" and "system2_score",
        plus a dashed diagonal from (1, 1) to (5, 5).
    """
    fig = go.Figure()
    for conf in data:
        reviews = data[conf]["reviews"]
        for lbl in ["System 1", "Mixed", "System 2", "Non-evaluative"]:
            subset = [
                r for r in reviews
                if r.get("label") == lbl
                and r.get("system1_score")
                and r.get("system2_score")
            ]
            if not subset:
                continue
            fig.add_trace(go.Scatter(
                x=[r["system1_score"] for r in subset],
                y=[r["system2_score"] for r in subset],
                mode="markers",
                name=f"{conf} — {lbl}",
                marker=dict(
                    color=LABEL_COLORS.get(lbl, _FALLBACK_COLOR),
                    size=5,
                    opacity=0.6,
                ),
                legendgroup=lbl,
                showlegend=True,
            ))
    # diagonal reference line
    fig.add_shape(
        type="line",
        x0=1, y0=1, x1=5, y1=5,
        line=dict(color="gray", dash="dash", width=1),
    )
    fig.update_layout(
        title="System 1 vs System 2 Score (all reviews)",
        xaxis=dict(title="System 1 Score", range=[0.8, 5.2]),
        yaxis=dict(title="System 2 Score", range=[0.8, 5.2]),
        height=480,
        margin=dict(t=50, b=40),
    )
    return fig


def fig_bias_heatmap(data: dict) -> go.Figure:
    """Heatmap: bias frequency (% of reviews) per conference.

    Args:
        data: Mapping shaped like the return value of load_all().

    Returns:
        A heatmap with one row per conference and one column per bias name.
        Each cell is the percentage of that conference's reviews whose
        "bias_diagnostics" contains the bias, annotated with the percentage
        and the raw count.
    """
    bias_order = [
        "Checklist Inflation",
        "Representativeness Heuristic",
        "Question Substitution",
        "Conclusion-First Justification",
        "Overconfidence",
        "Narrative Fallacy",
        "Authority Substitution",
        "Confirmation Bias",
    ]
    confs = list(data.keys())
    z = []
    text = []
    for conf in confs:
        reviews = data[conf]["reviews"]
        n = len(reviews) or 1  # avoid division by zero for an empty conference
        row = []
        trow = []
        for bias in bias_order:
            # `or []` also covers a review whose bias_diagnostics is null.
            cnt = sum(1 for r in reviews if bias in (r.get("bias_diagnostics") or []))
            pct = round(cnt / n * 100, 1)
            row.append(pct)
            trow.append(f"{pct}%<br>({cnt})")  # <br> is Plotly's line break
        z.append(row)
        text.append(trow)

    fig = go.Figure(go.Heatmap(
        z=z,
        x=bias_order,
        y=confs,
        text=text,
        texttemplate="%{text}",
        colorscale="YlOrRd",
        showscale=True,
        colorbar=dict(title="% reviews"),
    ))
    fig.update_layout(
        title="Bias Diagnostics Frequency (% of reviews per conference)",
        xaxis=dict(tickangle=-30),
        height=320,
        margin=dict(t=50, b=120),
    )
    return fig


def fig_rqs_distribution(data: dict) -> go.Figure:
    """Violin: RQS distribution per conference.

    Args:
        data: Mapping shaped like the return value of load_all().

    Returns:
        A figure with one violin trace per conference, built from the truthy
        "overall_reasoning_quality_score" values of its reviews.
    """
    fig = go.Figure()
    for conf in data:
        fig.add_trace(go.Violin(
            y=_rqs_values(data[conf]["reviews"]),
            name=conf,
            box_visible=True,
            meanline_visible=True,
            fillcolor=CONF_COLORS.get(conf, _FALLBACK_COLOR),
            opacity=0.7,
            line_color="white",
        ))
    fig.update_layout(
        title="RQS Distribution by Conference",
        yaxis=dict(title="Overall Reasoning Quality Score (1–5)"),
        height=400,
        margin=dict(t=50, b=40),
    )
    return fig


# ── Summary text ──────────────────────────────────────────────────────────────

def build_summary(data: dict) -> str:
    """Build a Markdown summary with one paragraph per conference.

    Args:
        data: Mapping shaped like the return value of load_all().

    Returns:
        One line per conference that has reviews, joined by blank lines. Each
        line gives the review count, the mean RQS (0 when no review has a
        score) and the Mixed / System 1 / System 2 label shares.
    """
    lines = []
    for conf in data:
        reviews = data[conf]["reviews"]
        if not reviews:
            continue
        n = len(reviews)
        lc = Counter(r.get("label") for r in reviews)
        rqs = _rqs_values(reviews)
        mean_rqs = sum(rqs) / len(rqs) if rqs else 0
        lines.append(
            f"**{conf}** — {n} reviews · RQS mean {mean_rqs:.2f} · "
            f"Mixed {lc.get('Mixed', 0) / n * 100:.0f}% · "
            f"S1 {lc.get('System 1', 0) / n * 100:.0f}% · "
            f"S2 {lc.get('System 2', 0) / n * 100:.0f}%"
        )
    return "\n\n".join(lines)


FINDINGS = """
### Key Findings

*100 papers × 3 conferences, ~1,150 reviews, rated by claude-sonnet-4-6. Papers sampled by stratified random sampling proportional to acceptance tier (Oral / Spotlight / Poster) within each venue.*

1. **ICML and NeurIPS reviewers show more System 2 tendency (~23–26%) than ICLR (16%).** ICML's structured fields (*Claims and Evidence*, *Theoretical Claims*, *Experimental Designs*) appear to scaffold more explicit, decomposed reasoning.

2. **Despite different formats and communities, the overall analytical depth of peer review is remarkably uniform** (RQS 2.80–2.94 / 5), suggesting a field-wide ceiling rather than venue-specific culture.

3. **Decision tier does not predict review quality.** Oral-paper reviews are not systematically stronger than Poster reviews (differences < 0.2 RQS points). Reviewers do not write more analytically for papers they rate highly.

---

> *We are not against AI review. We are against flawed reasoning behind review.*
"""