Spaces:

nuojohnchen
/

Kahneman4Review

Running

File size: 10,393 Bytes

"""analytics.py — Load sample results and build Plotly figures for the Analytics tab."""

import json
import os
from collections import Counter, defaultdict

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ── Data loading ──────────────────────────────────────────────────────────────

# Directory containing this module; result files are looked up relative to it.
_DIR = os.path.dirname(__file__)

# Conference display name -> JSON Lines results file name.
# Insertion order matters: figure builders iterate these keys in order.
DATASETS = {
    "ICLR 2025": "iclr2025_v2_results.jsonl",
    "ICML 2025": "icml2025_v3_results.jsonl",
    "NeurIPS 2025": "neurips2025_v3_results.jsonl",
}

# Review label -> hex colour used for that label's traces.
LABEL_COLORS = {
    "System 1": "#ef4444",
    "Mixed": "#f59e0b",
    "System 2": "#22c55e",
}

# Conference display name -> hex colour used for that conference's traces.
CONF_COLORS = {
    "ICLR 2025": "#6366f1",
    "ICML 2025": "#f59e0b",
    "NeurIPS 2025": "#22c55e",
}


def _load_results(fname: str) -> list:
    """Read a JSON Lines results file located next to this module.

    Args:
        fname: File name, joined onto ``_DIR``.

    Returns:
        The parsed JSON value of every non-blank line, in file order.
        An empty list if the file does not exist. Lines that are not valid
        JSON are skipped, so one corrupt record does not discard the rest.
    """
    path = os.path.join(_DIR, fname)
    records = []
    try:
        # Explicit UTF-8 so decoding does not depend on the platform's
        # default locale; `with` guarantees the handle is closed.
        with open(path, encoding="utf-8") as fh:
            for raw in fh:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    records.append(json.loads(raw))
                except ValueError:
                    # json.JSONDecodeError is a ValueError subclass; only
                    # parse failures are tolerated, not arbitrary errors.
                    continue
    except FileNotFoundError:
        # A missing dataset is treated as "no results" rather than an error.
        return []
    return records


def load_all() -> dict:
    """Load every dataset in ``DATASETS`` and flatten its ratings.

    Returns:
        ``{conf: {"papers": [...], "reviews": [...], "metas": [...]}}``.
        ``papers`` is the raw record list from the results file.
        ``reviews`` holds each entry of a paper's ``review_ratings`` that has
        a truthy ``label``; ``metas`` holds each paper's ``metareview_rating``
        that has a truthy ``label``. Both are copied and tagged with
        ``_decision`` (the paper's ``decision``, ``""`` if absent) and
        ``_conf`` (the conference name).
    """
    data = {}
    for conf, fname in DATASETS.items():
        papers = _load_results(fname)
        reviews = []
        metas = []
        for paper in papers:
            # A results line may parse to something other than an object;
            # such records carry no ratings.
            if not isinstance(paper, dict):
                continue
            tags = {"_decision": paper.get("decision", ""), "_conf": conf}

            # `or []` covers a key that is present but holds JSON null,
            # which `.get(key, [])` would return as None.
            for rating in paper.get("review_ratings") or []:
                if isinstance(rating, dict) and rating.get("label"):
                    reviews.append({**rating, **tags})

            meta = paper.get("metareview_rating")
            if isinstance(meta, dict) and meta.get("label"):
                metas.append({**meta, **tags})
        data[conf] = {"papers": papers, "reviews": reviews, "metas": metas}
    return data


# ── Figure builders ───────────────────────────────────────────────────────────

def fig_label_distribution(data: dict) -> go.Figure:
    """Build a grouped bar chart of review-label shares per conference.

    Args:
        data: Mapping of conference name to a dict with a ``"reviews"`` list;
            every review must carry a ``"label"`` key.

    Returns:
        A figure with one bar trace per label ("System 1", "Mixed",
        "System 2"). Each bar is that label's percentage of the conference's
        reviews, rounded to one decimal; a conference without reviews gets 0.
    """
    conferences = list(data.keys())
    figure = go.Figure()

    for label in ("System 1", "Mixed", "System 2"):
        shares = []
        for conference in conferences:
            conf_reviews = data[conference]["reviews"]
            if conf_reviews:
                matching = [rv for rv in conf_reviews if rv["label"] == label]
                shares.append(round(len(matching) / len(conf_reviews) * 100, 1))
            else:
                # Nothing to divide by: report an empty bar.
                shares.append(0)

        bar = go.Bar(
            name=label,
            x=conferences,
            y=shares,
            marker_color=LABEL_COLORS.get(label, "#888"),
            text=[f"{share}%" for share in shares],
            textposition="outside",
        )
        figure.add_trace(bar)

    figure.update_layout(
        title="Review Label Distribution by Conference",
        barmode="group",
        yaxis=dict(title="% of reviews", range=[0, 75]),
        legend=dict(orientation="h", y=-0.2),
        height=420,
        margin=dict(t=50, b=80),
    )
    return figure


def fig_rqs_by_decision(data: dict) -> go.Figure:
    """Build a grouped bar chart of mean RQS per decision tier per conference.

    Args:
        data: Mapping of conference name to a dict with a ``"reviews"`` list.
            Each review is read for ``"_decision"`` and
            ``"overall_reasoning_quality_score"``.

    Returns:
        A figure with one bar trace per conference over the tiers
        Oral / Spotlight / Poster. A bar is the mean score (two decimals) of
        the reviews in that tier, or absent when the tier has no scored
        reviews. Bar labels show the mean and the review count.
    """
    # Keys are case-folded so any capitalisation of a decision string maps
    # to its tier, instead of enumerating spelling variants by hand.
    decision_map = {
        "accept (oral)":             "Oral",
        "accept (spotlight)":        "Spotlight",
        "accept (spotlight poster)": "Spotlight",
        "accept (poster)":           "Poster",
    }
    tiers = ["Oral", "Spotlight", "Poster"]

    fig = go.Figure()
    for conf, bundle in data.items():
        by_tier = defaultdict(list)
        for r in bundle["reviews"]:
            decision = r.get("_decision") or ""
            tier = decision_map.get(str(decision).strip().casefold())
            rqs = r.get("overall_reasoning_quality_score")
            # Reviews with a missing/empty score or a non-accept decision
            # are left out of the averages.
            if tier and rqs:
                by_tier[tier].append(float(rqs))

        y_vals = []
        counts = []
        for t in tiers:
            scores = by_tier.get(t, [])
            counts.append(len(scores))
            y_vals.append(round(sum(scores) / len(scores), 2) if scores else None)

        fig.add_trace(go.Bar(
            name=conf,
            x=tiers,
            y=y_vals,
            # Fall back to neutral grey for a conference without a
            # configured colour, mirroring the LABEL_COLORS lookups.
            marker_color=CONF_COLORS.get(conf, "#888"),
            text=[f"{v:.2f}<br>(n={c})" if v is not None else ""
                  for v, c in zip(y_vals, counts)],
            textposition="outside",
        ))

    fig.update_layout(
        title="Mean Reasoning Quality Score by Decision Tier",
        barmode="group",
        yaxis=dict(title="RQS (1–5)", range=[0, 4]),
        legend=dict(orientation="h", y=-0.2),
        height=420,
        margin=dict(t=50, b=80),
    )
    return fig


def fig_s1_s2_scatter(data: dict) -> go.Figure:
    """Build a scatter plot of System 1 score against System 2 score.

    Args:
        data: Mapping of conference name to a dict with a ``"reviews"`` list.

    Returns:
        A figure with one marker trace per (conference, label) pair that has
        at least one review with both scores truthy, coloured by label, plus
        a dashed grey diagonal from (1, 1) to (5, 5).
    """
    label_order = ("System 1", "Mixed", "System 2", "Non-evaluative")
    figure = go.Figure()

    for conference, bundle in data.items():
        conf_reviews = bundle["reviews"]
        for label in label_order:
            xs = []
            ys = []
            for rv in conf_reviews:
                if rv.get("label") != label:
                    continue
                # Both scores must be present and non-zero to plot a point.
                if not (rv.get("system1_score") and rv.get("system2_score")):
                    continue
                xs.append(rv["system1_score"])
                ys.append(rv["system2_score"])
            if not xs:
                continue

            marker_style = dict(
                color=LABEL_COLORS.get(label, "#888"),
                size=5,
                opacity=0.6,
            )
            figure.add_trace(go.Scatter(
                x=xs,
                y=ys,
                mode="markers",
                name=f"{conference} — {label}",
                marker=marker_style,
                legendgroup=label,
                showlegend=True,
            ))

    # Reference diagonal: points on it have equal System 1 and System 2 scores.
    figure.add_shape(
        type="line",
        x0=1, y0=1, x1=5, y1=5,
        line=dict(color="gray", dash="dash", width=1),
    )
    figure.update_layout(
        title="System 1 vs System 2 Score (all reviews)",
        xaxis=dict(title="System 1 Score", range=[0.8, 5.2]),
        yaxis=dict(title="System 2 Score", range=[0.8, 5.2]),
        height=480,
        margin=dict(t=50, b=40),
    )
    return figure


def fig_bias_heatmap(data: dict) -> go.Figure:
    """Build a heatmap of how often each bias is flagged, per conference.

    Args:
        data: Mapping of conference name to a dict with a ``"reviews"`` list.
            Each review's ``"bias_diagnostics"`` is tested for membership of
            each bias name; a missing or null value counts as no biases.

    Returns:
        A figure with one row per conference and one column per bias. Cell
        values are the percentage of that conference's reviews that list the
        bias (one decimal); cell text shows the percentage and raw count.
    """
    bias_order = [
        "Checklist Inflation",
        "Representativeness Heuristic",
        "Question Substitution",
        "Conclusion-First Justification",
        "Overconfidence",
        "Narrative Fallacy",
        "Authority Substitution",
        "Confirmation Bias",
    ]
    confs = list(data.keys())
    z = []
    text = []
    for conf in confs:
        reviews = data[conf]["reviews"]
        # Avoid dividing by zero for a conference with no reviews.
        n = len(reviews) or 1
        # Fetch each review's diagnostics once. `or []` also covers a key
        # that is present but null, which `.get(key, [])` returns as None
        # and which would make the `in` test raise TypeError.
        diagnostics = [r.get("bias_diagnostics") or [] for r in reviews]
        row = []
        trow = []
        for b in bias_order:
            cnt = sum(1 for flagged in diagnostics if b in flagged)
            pct = round(cnt / n * 100, 1)
            row.append(pct)
            trow.append(f"{pct}%<br>({cnt})")
        z.append(row)
        text.append(trow)

    fig = go.Figure(go.Heatmap(
        z=z,
        x=bias_order,
        y=confs,
        text=text,
        texttemplate="%{text}",
        colorscale="YlOrRd",
        showscale=True,
        colorbar=dict(title="% reviews"),
    ))
    fig.update_layout(
        title="Bias Diagnostics Frequency (% of reviews per conference)",
        xaxis=dict(tickangle=-30),
        height=320,
        margin=dict(t=50, b=120),
    )
    return fig


def fig_rqs_distribution(data: dict) -> go.Figure:
    """Build a violin plot of the RQS distribution, one violin per conference.

    Args:
        data: Mapping of conference name to a dict with a ``"reviews"`` list.
            Every conference name must be a key of ``CONF_COLORS``.

    Returns:
        A figure with one violin per conference, drawn from the reviews whose
        ``"overall_reasoning_quality_score"`` is truthy, converted to float.
    """
    figure = go.Figure()
    for conference, bundle in data.items():
        scores = []
        for rv in bundle["reviews"]:
            raw_score = rv.get("overall_reasoning_quality_score")
            if raw_score:
                scores.append(float(raw_score))

        violin = go.Violin(
            y=scores,
            name=conference,
            box_visible=True,
            meanline_visible=True,
            fillcolor=CONF_COLORS[conference],
            opacity=0.7,
            line_color="white",
        )
        figure.add_trace(violin)

    figure.update_layout(
        title="RQS Distribution by Conference",
        yaxis=dict(title="Overall Reasoning Quality Score (1–5)"),
        height=400,
        margin=dict(t=50, b=40),
    )
    return figure


# ── Summary text ──────────────────────────────────────────────────────────────

def build_summary(data: dict) -> str:
    """Render a one-line Markdown summary for each conference with reviews.

    Args:
        data: Mapping of conference name to a dict with a ``"reviews"`` list;
            every review must carry a ``"label"`` key.

    Returns:
        One paragraph per conference with at least one review, separated by
        blank lines. Each paragraph gives the review count, the mean RQS over
        reviews with a truthy score (0 if none), and the percentage of
        "Mixed", "System 1" and "System 2" labels.
    """
    paragraphs = []
    for conf, bundle in data.items():
        reviews = bundle["reviews"]
        if not reviews:
            continue

        total = len(reviews)
        label_counts = Counter(rv["label"] for rv in reviews)

        scores = []
        for rv in reviews:
            raw_score = rv.get("overall_reasoning_quality_score")
            if raw_score:
                scores.append(float(raw_score))
        mean_rqs = sum(scores) / len(scores) if scores else 0

        # Share of all reviews (not just the three listed labels).
        pct = {
            label: label_counts.get(label, 0) / total * 100
            for label in ("Mixed", "System 1", "System 2")
        }
        segments = [
            f"**{conf}** — {total} reviews",
            f"RQS mean {mean_rqs:.2f}",
            f"Mixed {pct['Mixed']:.0f}%",
            f"S1 {pct['System 1']:.0f}%",
            f"S2 {pct['System 2']:.0f}%",
        ]
        paragraphs.append(" · ".join(segments))
    return "\n\n".join(paragraphs)


# Static Markdown text: a "Key Findings" heading, a note on the sample and
# rating method, three numbered findings, and a closing quote. The figures
# quoted here are hard-coded, not derived from the loaded data, so they must
# be revised by hand if the result files change.
# NOTE(review): presumably rendered in the Analytics tab next to the figures;
# confirm against the caller.
FINDINGS = """

### Key Findings



*100 papers × 3 conferences, ~1,150 reviews, rated by claude-sonnet-4-6. Papers sampled by stratified random sampling proportional to acceptance tier (Oral / Spotlight / Poster) within each venue.*



1. **ICML and NeurIPS reviewers show more System 2 tendency (~23–26%) than ICLR (16%).** ICML's structured fields (*Claims and Evidence*, *Theoretical Claims*, *Experimental Designs*) appear to scaffold more explicit, decomposed reasoning.



2. **Despite different formats and communities, the overall analytical depth of peer review is remarkably uniform** (RQS 2.80–2.94 / 5), suggesting a field-wide ceiling rather than venue-specific culture.



3. **Decision tier does not predict review quality.** Oral-paper reviews are not systematically stronger than Poster reviews (differences < 0.2 RQS points). Reviewers do not write more analytically for papers they rate highly.



---



> *We are not against AI review. We are against flawed reasoning behind review.*

"""