nuocuhz Claude Sonnet 4.6 committed on
Commit
552a059
·
1 Parent(s): 6b4c90f

Wire meta-review tab to rate_metareview(), clean up duplicate code in app.py

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. app.py +45 -251
  3. rater.py +285 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -5,7 +5,10 @@ import json
5
  import datetime
6
  import gradio as gr
7
  from fetcher import fetch_paper_reviews, get_bundled_ids
8
- from rater import rate_review, format_result_markdown
 
 
 
9
 
10
  _paper_cache: dict = {}
11
  _last_result: dict = {} # stores last single-reviewer rating for feedback
@@ -184,6 +187,32 @@ def run_rating_all(paper_id: str, api_key: str):
184
  yield accumulated + "\n\n---\n\n*Done.*", gr.update(visible=False)
185
 
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  def submit_feedback(satisfaction: str, correct_label: str, comment: str):
188
  if not _last_result:
189
  return "No rating to give feedback on yet."
@@ -200,7 +229,7 @@ def submit_feedback(satisfaction: str, correct_label: str, comment: str):
200
  }
201
  with open(FEEDBACK_FILE, "a") as f:
202
  f.write(json.dumps(entry, ensure_ascii=False) + "\n")
203
- return f"✅ Feedback saved. Thank you!"
204
 
205
 
206
  # ── UI ──────────────────────────────────────────────────────────────────────────
@@ -296,9 +325,17 @@ This perspective reframes peer review as a **reasoning process** rather than mer
296
 
297
  with gr.Tab("Meta-Review"):
298
  meta_display = gr.Markdown("*Load a paper to see the meta-review.*")
 
 
 
 
299
 
300
  # ── Wire events ────────────────────────────────────────────────────────────
301
- load_btn.click(load_paper, [paper_id_box], [reviewer_dd, paper_info, meta_display, result_display])
 
 
 
 
302
  reviewer_dd.change(show_review, [paper_id_box, reviewer_dd], [review_display])
303
 
304
  rate_one_btn.click(
@@ -311,6 +348,11 @@ This perspective reframes peer review as a **reasoning process** rather than mer
311
  [paper_id_box, api_key_box],
312
  [result_display, feedback_panel],
313
  )
 
 
 
 
 
314
  submit_fb_btn.click(
315
  submit_feedback,
316
  [satisfaction, correct_label, comment],
@@ -320,251 +362,3 @@ This perspective reframes peer review as a **reasoning process** rather than mer
320
 
321
  if __name__ == "__main__":
322
  demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
323
-
324
-
325
- _paper_cache: dict = {}
326
-
327
- # ── Section content ────────────────────────────────────────────────────────────
328
-
329
- SECTION_CONTENT = {
330
- "📖 Motivation": """### Motivation
331
-
332
- Peer review is one of the central institutions governing scientific progress, yet most existing analysis focuses on outcomes such as scores, acceptance rates, disagreement levels, or textual sentiment. These signals are useful but incomplete. They do not directly capture **how reviewers think**.
333
-
334
- Kahneman's dual-process framework provides a principled theoretical lens:
335
-
336
- - **System 1** is rapid, associative, intuitive, and often relies on heuristics such as representativeness, familiarity, fluency, and global impressions.
337
- - **System 2** is effortful, analytical, explicit, and more likely to engage in structured reasoning, evidence integration, and conditional judgment.
338
-
339
- Applied to peer review, this distinction enables us to study whether a review is dominated by venue-fit heuristics, abstract "overall impression" judgments, or conclusion-first reasoning — or instead by falsifiable claims, methodological decomposition, comparative evidence, and belief updating.
340
-
341
- This is not merely a stylistic distinction. It bears directly on questions of **review quality**, **rebuttal responsiveness**, **decision transparency**, and **cognitive bias in evaluation**.""",
342
-
343
- "🎯 Core Objectives": """### Core Objectives
344
-
345
- The goal of Kahneman4Review is to build a robust framework for:
346
-
347
- 1. **Classifying** review text into cognitive reasoning modes inspired by Kahneman's theory;
348
- 2. **Characterizing** the effort structure of review reasoning, from low-effort impressionistic judgment to high-effort analytical synthesis;
349
- 3. **Diagnosing** cognitive biases in review and metareview, such as representativeness heuristics, question substitution, anchoring, confirmation bias, overconfidence, and narrative fallacy;
350
- 4. **Supporting** LLM-based judges that can assess the reasoning mode and epistemic quality of reviews in a structured, reproducible way.""",
351
-
352
- "📐 Academic Claim": """### Academic Claim
353
-
354
- The central academic claim is that **review quality cannot be fully understood without reasoning structure**. A review may be long, harsh, polite, or even technically correct, yet still be cognitively shallow. Conversely, a review may be negative but high-quality if it exhibits strong System 2 properties such as precise falsifiability, explicit evidence chains, and principled updating under rebuttal.
355
-
356
- This project sits at the intersection of:
357
- - **Metascience**: understanding the scientific process itself;
358
- - **AI for Science / AI for Institutions**: using language models to analyze scientific governance mechanisms;
359
- - **Computational social science**: studying evaluation behavior through text;
360
- - **LLM-as-a-Judge research**: moving beyond outcome scoring toward reasoning-aware judgment;
361
- - **Cognitive science of decision-making**: operationalizing dual-process theory in institutional text.""",
362
-
363
- "🔑 Key Contributions": """### Key Contributions
364
-
365
- **1. A cognitive taxonomy for peer review**
366
- We operationalize Kahneman's theory into an annotation framework suitable for review text, including System 1, System 2, mixed / transitional reasoning, and non-evaluative administrative language.
367
-
368
- **2. Effort-sensitive reasoning analysis**
369
- Beyond binary labels, the framework distinguishes different levels of System 2 effort, separating shallow structured criticism from deeper falsification-oriented reasoning and meta-level synthesis.
370
-
371
- **3. Bias diagnostics for review interpretation**
372
- The framework explicitly identifies recurring bias pathways: venue-fit substitution, authority alignment, conclusion-first justification, selective evidence weighting, and failure to update after rebuttal.""",
373
-
374
- "💡 Why This Matters": """### Why This Matters
375
-
376
- The significance of this project is not limited to review analytics. More broadly, it addresses a foundational problem in the evaluation of human and AI reasoning:
377
-
378
- > *How can we distinguish genuine analysis from articulate intuition?*
379
-
380
- In academic review, this distinction affects fairness, transparency, and the reliability of scientific gatekeeping. In LLM evaluation, it affects whether models merely mimic analytical language or actually detect structured reasoning.
381
-
382
- By making the cognitive mode of review explicit, Kahneman4Review aims to support better review auditing, more interpretable LLM judges, stronger rebuttal strategies, and more scientifically grounded discussion of what constitutes a "good review." """,
383
- }
384
-
385
- SECTION_LABELS = list(SECTION_CONTENT.keys())
386
-
387
-
388
- # ── Callbacks ──────────────────────────────────────────────────────────────────
389
-
390
- def _get_api_key(user_key: str) -> str:
391
- k = (user_key or "").strip()
392
- return k or os.environ.get("ANTHROPIC_API_KEY", "")
393
-
394
-
395
- def toggle_section(label, current_label):
396
- """Toggle section: if same button clicked again, collapse."""
397
- if label == current_label:
398
- return "", gr.update(visible=False), ""
399
- return SECTION_CONTENT.get(label, ""), gr.update(visible=True), label
400
-
401
-
402
- def load_paper(paper_id: str):
403
- paper_id = (paper_id or "").strip()
404
- if not paper_id:
405
- return gr.update(choices=[], value=None), "Please enter a paper ID.", "", ""
406
- try:
407
- paper = fetch_paper_reviews(paper_id)
408
- _paper_cache[paper_id] = paper
409
- reviewers = [r["reviewer_id"] for r in paper["reviews"]]
410
- decision = paper.get("decision", "")
411
- info = f"**{paper.get('title', paper_id)}**\n\n{paper.get('conference', '')}"
412
- if decision:
413
- info += f" · **Decision:** {decision}"
414
- info += f" · {len(reviewers)} reviewer(s)"
415
- metareview = paper.get("metareview", "")
416
- meta_md = f"**Area Chair Meta-Review:**\n\n{metareview}" if metareview else "*No meta-review available.*"
417
- return gr.update(choices=reviewers, value=reviewers[0] if reviewers else None), info, meta_md, ""
418
- except Exception as e:
419
- return gr.update(choices=[], value=None), f"Error: {e}", "", ""
420
-
421
-
422
- def show_review(paper_id: str, reviewer_id: str):
423
- paper = _paper_cache.get((paper_id or "").strip())
424
- if not paper or not reviewer_id:
425
- return ""
426
- for r in paper["reviews"]:
427
- if r["reviewer_id"] == reviewer_id:
428
- return f"**Initial:** {r['initial_rating']} **Final:** {r['final_rating']}\n\n{r['review_content']}"
429
- return ""
430
-
431
-
432
- def run_rating(paper_id: str, reviewer_id: str, api_key: str):
433
- paper = _paper_cache.get((paper_id or "").strip())
434
- if not paper:
435
- yield "Please load a paper first."
436
- return
437
- if not reviewer_id:
438
- yield "Please select a reviewer."
439
- return
440
- key = _get_api_key(api_key)
441
- if not key:
442
- yield "No API key found. Enter your Anthropic API key above."
443
- return
444
- review = next((r for r in paper["reviews"] if r["reviewer_id"] == reviewer_id), None)
445
- if not review:
446
- yield f"Reviewer {reviewer_id} not found."
447
- return
448
- yield f"Calling Claude to rate **{reviewer_id}**…"
449
- try:
450
- result = rate_review(
451
- review_content=review["review_content"],
452
- initial_rating=review["initial_rating"],
453
- final_rating=review["final_rating"],
454
- conference=paper.get("conference", ""),
455
- api_key=key,
456
- )
457
- yield format_result_markdown(reviewer_id, result)
458
- except Exception as e:
459
- yield f"Error: {e}"
460
-
461
-
462
- def run_rating_all(paper_id: str, api_key: str):
463
- paper = _paper_cache.get((paper_id or "").strip())
464
- if not paper:
465
- yield "Please load a paper first."
466
- return
467
- key = _get_api_key(api_key)
468
- if not key:
469
- yield "No API key found. Enter your Anthropic API key above."
470
- return
471
- accumulated = ""
472
- for i, review in enumerate(paper["reviews"]):
473
- rid = review["reviewer_id"]
474
- marker = f"\n\n---\n\n*Rating {i+1}/{len(paper['reviews'])}: {rid}…*"
475
- accumulated += marker
476
- yield accumulated
477
- try:
478
- result = rate_review(
479
- review_content=review["review_content"],
480
- initial_rating=review["initial_rating"],
481
- final_rating=review["final_rating"],
482
- conference=paper.get("conference", ""),
483
- api_key=key,
484
- )
485
- accumulated = accumulated[: -len(marker)]
486
- accumulated += "\n\n---\n\n" + format_result_markdown(rid, result)
487
- except Exception as e:
488
- accumulated = accumulated[: -len(marker)]
489
- accumulated += f"\n\n---\n\n**{rid}** — Error: {e}"
490
- yield accumulated
491
- yield accumulated + "\n\n---\n\n*Done.*"
492
-
493
-
494
- # ── UI ─────────────────────────────────────────────────────────────────────────
495
-
496
- with gr.Blocks(title="Kahneman4Review", theme=gr.themes.Soft()) as demo:
497
-
498
- gr.Markdown("""# 🧠 Kahneman4Review
499
-
500
- Kahneman4Review is a research-oriented framework for analyzing the cognitive structure of academic peer review through the lens of Daniel Kahneman's dual-process theory in *Thinking, Fast and Slow*. The project studies whether review statements are primarily driven by **System 1** reasoning (fast, intuitive, impression-based judgment) or by **System 2** reasoning (slow, deliberate, evidence-based analysis).
501
-
502
- Rather than treating reviews only as scalar signals of acceptance or rejection, this project asks a deeper scientific question:
503
-
504
- > *What kinds of cognition are reflected in peer review text, and how do those cognitive modes shape review quality, fairness, and decision reliability?*
505
-
506
- This perspective reframes peer review as a **reasoning process** rather than merely an evaluative outcome.
507
- """)
508
-
509
- # ── Shared expandable section ──────────────────────────────────────────────
510
- _current_section = gr.State("")
511
-
512
- with gr.Row():
513
- sec_btns = [
514
- gr.Button(label, size="sm", variant="secondary")
515
- for label in SECTION_LABELS
516
- ]
517
-
518
- section_box = gr.Markdown("", visible=False)
519
-
520
- for btn in sec_btns:
521
- btn.click(
522
- fn=toggle_section,
523
- inputs=[btn, _current_section],
524
- outputs=[section_box, section_box, _current_section],
525
- )
526
-
527
- gr.Markdown("""---
528
- > *"A review should be judged not only by what it concludes, but by how it reaches that conclusion."*
529
-
530
- ---""")
531
-
532
- # ── Paper loader ───────────────────────────────────────────────────────────
533
- api_key_box = gr.Textbox(
534
- label="Anthropic API Key (leave blank to use server key)",
535
- placeholder="sk-ant-...",
536
- type="password",
537
- )
538
-
539
- with gr.Row():
540
- paper_id_box = gr.Textbox(
541
- label="OpenReview Paper ID",
542
- placeholder="e.g. B1e3OlStPB",
543
- scale=3,
544
- )
545
- load_btn = gr.Button("Load Paper", variant="primary", scale=1)
546
-
547
- paper_info = gr.Markdown("")
548
-
549
- with gr.Tabs():
550
- with gr.Tab("Reviews"):
551
- with gr.Row():
552
- reviewer_dd = gr.Dropdown(choices=[], label="Select Reviewer", interactive=True, scale=2)
553
- rate_one_btn = gr.Button("AI Rate This Reviewer", variant="primary", scale=1)
554
- rate_all_btn = gr.Button("AI Rate All Reviewers", variant="secondary", scale=1)
555
- review_display = gr.Markdown("")
556
- gr.Markdown("---")
557
- gr.Markdown("### Rating Results")
558
- result_display = gr.Markdown("")
559
-
560
- with gr.Tab("Meta-Review"):
561
- meta_display = gr.Markdown("*Load a paper to see the meta-review.*")
562
-
563
- load_btn.click(load_paper, [paper_id_box], [reviewer_dd, paper_info, meta_display, result_display])
564
- reviewer_dd.change(show_review, [paper_id_box, reviewer_dd], [review_display])
565
- rate_one_btn.click(run_rating, [paper_id_box, reviewer_dd, api_key_box], [result_display])
566
- rate_all_btn.click(run_rating_all, [paper_id_box, api_key_box], [result_display])
567
-
568
-
569
- if __name__ == "__main__":
570
- demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
 
5
  import datetime
6
  import gradio as gr
7
  from fetcher import fetch_paper_reviews, get_bundled_ids
8
+ from rater import (
9
+ rate_review, format_result_markdown,
10
+ rate_metareview, format_metareview_result_markdown,
11
+ )
12
 
13
  _paper_cache: dict = {}
14
  _last_result: dict = {} # stores last single-reviewer rating for feedback
 
187
  yield accumulated + "\n\n---\n\n*Done.*", gr.update(visible=False)
188
 
189
 
190
+ def run_metareview_rating(paper_id: str, api_key: str):
191
+ paper = _paper_cache.get((paper_id or "").strip())
192
+ if not paper:
193
+ yield "Please load a paper first."
194
+ return
195
+ metareview = paper.get("metareview", "").strip()
196
+ if not metareview:
197
+ yield "No meta-review available for this paper."
198
+ return
199
+ key = _get_api_key(api_key)
200
+ if not key:
201
+ yield "No API key found. Enter your Anthropic API key above."
202
+ return
203
+ yield "Calling Claude to rate the meta-review…"
204
+ try:
205
+ result = rate_metareview(
206
+ metareview_content=metareview,
207
+ decision=paper.get("decision", ""),
208
+ conference=paper.get("conference", ""),
209
+ api_key=key,
210
+ )
211
+ yield format_metareview_result_markdown(result)
212
+ except Exception as e:
213
+ yield f"Error: {e}"
214
+
215
+
216
  def submit_feedback(satisfaction: str, correct_label: str, comment: str):
217
  if not _last_result:
218
  return "No rating to give feedback on yet."
 
229
  }
230
  with open(FEEDBACK_FILE, "a") as f:
231
  f.write(json.dumps(entry, ensure_ascii=False) + "\n")
232
+ return "✅ Feedback saved. Thank you!"
233
 
234
 
235
  # ── UI ──────────────────────────────────────────────────────────────────────────
 
325
 
326
  with gr.Tab("Meta-Review"):
327
  meta_display = gr.Markdown("*Load a paper to see the meta-review.*")
328
+ gr.Markdown("---")
329
+ rate_meta_btn = gr.Button("AI Rate Meta-Review", variant="primary")
330
+ gr.Markdown("### Meta-Review Analysis")
331
+ meta_result_display = gr.Markdown("")
332
 
333
  # ── Wire events ────────────────────────────────────────────────────────────
334
+ load_btn.click(
335
+ load_paper,
336
+ [paper_id_box],
337
+ [reviewer_dd, paper_info, meta_display, result_display],
338
+ )
339
  reviewer_dd.change(show_review, [paper_id_box, reviewer_dd], [review_display])
340
 
341
  rate_one_btn.click(
 
348
  [paper_id_box, api_key_box],
349
  [result_display, feedback_panel],
350
  )
351
+ rate_meta_btn.click(
352
+ run_metareview_rating,
353
+ [paper_id_box, api_key_box],
354
+ [meta_result_display],
355
+ )
356
  submit_fb_btn.click(
357
  submit_feedback,
358
  [satisfaction, correct_label, comment],
 
362
 
363
  if __name__ == "__main__":
364
  demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
rater.py CHANGED
@@ -259,6 +259,291 @@ def rate_review(review_content: str, initial_rating: str, final_rating: str,
259
  return result
260
 
261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  def format_result_markdown(reviewer_id: str, result: dict) -> str:
263
  label = result.get("main_label", "?")
264
  icon = LABEL_COLORS.get(label, "⚫")
 
259
  return result
260
 
261
 
262
+ # ══════════════════════════════════════════════════════════════════════════════
263
+ # META-REVIEW PROMPT
264
+ # ══════════════════════════════════════════════════════════════════════════════
265
+
266
+ META_SYSTEM_PROMPT = """You are an expert evaluator of reasoning in academic meta-review.
267
+
268
+ Your task is to classify a given meta-review passage according to Daniel Kahneman's dual-process framework from *Thinking, Fast and Slow*, adapted to the context of academic review aggregation and decision-making.
269
+
270
+ A meta-review is NOT the same as an ordinary review.
271
+ In addition to object-level reasoning, a meta-review may perform higher-order reasoning by:
272
+ - aggregating reviewer evidence,
273
+ - handling disagreement,
274
+ - recording rebuttal-driven updates,
275
+ - synthesizing concerns into a final decision,
276
+ - and making an institutional recommendation traceable to prior evidence.
277
+
278
+ You are NOT judging whether the final decision is correct.
279
+ You are judging what kind of cognitive process the meta-review reflects, how much analytical effort it shows, how well it aggregates evidence, and how transparent its final decision is.
280
+
281
+ ==================================================
282
+ SECTION 1: LABEL SPACE
283
+ ==================================================
284
+
285
+ Assign exactly one main label:
286
+
287
+ 1. System 1
288
+ 2. System 2
289
+ 3. Mixed
290
+ 4. Non-evaluative
291
+
292
+ Definitions:
293
+
294
+ System 1:
295
+ The meta-review is primarily intuitive, compressed, shortcut-driven, authority-dependent, venue-fit driven, or conclusion-first.
296
+ It may summarize concerns vaguely, rely on global impressions, defer to others without real synthesis, or make a final decision that is weakly traceable to evidence.
297
+
298
+ System 2:
299
+ The meta-review performs explicit, careful, evidence-based synthesis.
300
+ It accurately aggregates reviewer concerns, distinguishes resolved vs unresolved issues, handles disagreement explicitly, reflects rebuttal-driven updates, and makes the final decision traceable to the preceding evidence.
301
+
302
+ Mixed:
303
+ The meta-review contains substantial signals of both System 1 and System 2.
304
+ For example, it may contain real synthesis but still end with a compressed venue-fit judgment, or summarize reviewer evidence carefully but ignore disagreement.
305
+
306
+ Non-evaluative:
307
+ The meta-review contains little or no meaningful evaluative reasoning.
308
+ Examples include purely administrative text, placeholder content, or a decision with no supporting reasoning.
309
+
310
+ ==================================================
311
+ SECTION 2: CORE JUDGING PRINCIPLES
312
+ ==================================================
313
+
314
+ 1. Do NOT judge based on decision correctness.
315
+ A meta-review that recommends rejection can still be System 2 if its reasoning is explicit and traceable.
316
+
317
+ 2. Do NOT judge based on length.
318
+ A short meta-review can be System 2 if it is precise and well-grounded.
319
+ A long meta-review can be System 1 if it is repetitive or impressionistic.
320
+
321
+ 3. DO judge based on aggregation quality.
322
+ Does the meta-reviewer accurately represent what the reviewers said?
323
+ Does it distinguish between reviewers who were convinced by the rebuttal and those who were not?
324
+
325
+ 4. DO judge based on decision traceability.
326
+ Can the final recommendation be traced back to specific evidence or reasoning in the meta-review?
327
+ Or does it appear as a conclusion without derivation?
328
+
329
+ 5. DO judge based on disagreement handling.
330
+ When reviewers disagree, does the meta-reviewer engage with the disagreement explicitly?
331
+ Or does it average, ignore, or defer to the majority without reasoning?
332
+
333
+ 6. DO judge based on rebuttal integration.
334
+ Does the meta-review reflect what changed (or did not change) after the rebuttal?
335
+ Or does it treat the rebuttal as irrelevant?
336
+
337
+ ==================================================
338
+ SECTION 3: DIMENSION SCORES
339
+ ==================================================
340
+
341
+ Score the meta-review on 6 dimensions (1–5 each):
342
+
343
+ D1. Aggregation Accuracy
344
+ Does the meta-reviewer accurately represent the reviewers' concerns?
345
+ 1 = Misrepresents or ignores reviewer content
346
+ 3 = Partially accurate, some omissions or distortions
347
+ 5 = Accurate and complete representation of reviewer positions
348
+
349
+ D2. Disagreement Handling
350
+ When reviewers disagree, does the meta-reviewer engage with the disagreement?
351
+ 1 = Ignores disagreement or averages without reasoning
352
+ 3 = Acknowledges disagreement but does not resolve it
353
+ 5 = Explicitly engages with disagreement and explains how it was resolved
354
+
355
+ D3. Rebuttal Integration
356
+ Does the meta-review reflect what changed after the rebuttal?
357
+ 1 = No mention of rebuttal or its effects
358
+ 3 = Mentions rebuttal but does not specify what changed
359
+ 5 = Explicitly states which concerns were resolved and which remain
360
+
361
+ D4. Decision Traceability
362
+ Can the final recommendation be traced to specific evidence?
363
+ 1 = Decision appears without derivation
364
+ 3 = Some connection between evidence and decision
365
+ 5 = Decision is fully traceable to specific prior reasoning
366
+
367
+ D5. Reasoning Explicitness
368
+ Does the meta-reviewer show their reasoning, or only state conclusions?
369
+ 1 = Pure assertion
370
+ 3 = Some reasoning present but incomplete
371
+ 5 = Fully explicit reasoning chains
372
+
373
+ D6. Synthesis Quality
374
+ Does the meta-review go beyond summarizing individual reviews to produce a coherent synthesis?
375
+ 1 = No synthesis, just a list of reviewer opinions
376
+ 3 = Some synthesis present
377
+ 5 = Coherent synthesis that integrates multiple perspectives into a unified assessment
378
+
379
+ ==================================================
380
+ SECTION 4: BIAS FLAGS
381
+ ==================================================
382
+
383
+ Flag any of the following if clearly evidenced:
384
+
385
+ - MAJORITY_DEFERENCE: Decision follows reviewer majority without independent reasoning
386
+ - AUTHORITY_DEFERENCE: Defers to a senior reviewer or author reputation without analysis
387
+ - VENUE_FIT: Decision based on fit to venue rather than paper quality
388
+ - REBUTTAL_DISMISSAL: Rebuttal ignored or dismissed without engagement
389
+ - AGGREGATION_COMPRESSION: Reviewer concerns compressed into vague summary losing specificity
390
+ - CONCLUSION_FIRST: Final decision stated before or without supporting reasoning
391
+ - SELECTIVE_SYNTHESIS: Only engages with evidence supporting the final decision
392
+ - OVERCONFIDENCE: Certainty expressed beyond what the evidence supports
393
+
394
+ ==================================================
395
+ SECTION 5: REASONING QUALITY SCORE
396
+ ==================================================
397
+
398
+ Assign a single overall Reasoning Quality Score from 1 to 10.
399
+
400
+ 1–2: No meaningful reasoning. Pure assertion or administrative text.
401
+ 3–4: Minimal reasoning. Mostly impressionistic with occasional specifics.
402
+ 5–6: Moderate reasoning. Some structured analysis but significant gaps.
403
+ 7–8: Strong reasoning. Mostly explicit, grounded, and well-aggregated.
404
+ 9–10: Exceptional reasoning. Fully explicit, traceable, and systematically structured.
405
+
406
+ ==================================================
407
+ OUTPUT FORMAT
408
+ ==================================================
409
+
410
+ Respond with ONLY a valid JSON object. No markdown fences, no explanation outside the JSON.
411
+
412
+ {
413
+ "main_label": "<System 1 | System 2 | Mixed | Non-evaluative>",
414
+ "label_confidence": "<high | medium | low>",
415
+ "system1_score": <0.0-1.0>,
416
+ "system2_score": <0.0-1.0>,
417
+ "reasoning_quality_score": <1-10>,
418
+ "dimension_scores": {
419
+ "aggregation_accuracy": <1-5>,
420
+ "disagreement_handling": <1-5>,
421
+ "rebuttal_integration": <1-5>,
422
+ "decision_traceability": <1-5>,
423
+ "reasoning_explicitness": <1-5>,
424
+ "synthesis_quality": <1-5>
425
+ },
426
+ "bias_flags": ["<BIAS_NAME>", ...],
427
+ "key_system1_signals": ["<signal>", ...],
428
+ "key_system2_signals": ["<signal>", ...],
429
+ "most_diagnostic_quote": "<exact quote from meta-review>",
430
+ "brief_rationale": "<2-3 sentences explaining the classification>"
431
+ }
432
+
433
+ Notes on system1_score and system2_score:
434
+ - Both are continuous values in [0.0, 1.0].
435
+ - They do NOT need to sum to 1.0.
436
+ - System 1 dominant: system1_score > 0.7, system2_score < 0.3
437
+ - System 2 dominant: system2_score > 0.7, system1_score < 0.3
438
+ - Mixed: both moderate (0.3–0.7)
439
+ - Non-evaluative: both low (<0.3)"""
440
+
441
+ META_DIMENSION_LABELS = {
442
+ "aggregation_accuracy": "Aggregation Accuracy",
443
+ "disagreement_handling": "Disagreement Handling",
444
+ "rebuttal_integration": "Rebuttal Integration",
445
+ "decision_traceability": "Decision Traceability",
446
+ "reasoning_explicitness": "Reasoning Explicitness",
447
+ "synthesis_quality": "Synthesis Quality",
448
+ }
449
+
450
+ META_WEIGHTS = {
451
+ "aggregation_accuracy": 0.20,
452
+ "disagreement_handling": 0.15,
453
+ "rebuttal_integration": 0.15,
454
+ "decision_traceability": 0.20,
455
+ "reasoning_explicitness": 0.15,
456
+ "synthesis_quality": 0.15,
457
+ }
458
+
459
+
460
+ def rate_metareview(metareview_content: str, decision: str,
461
+ conference: str, api_key: str) -> dict:
462
+ client = anthropic.Anthropic(api_key=api_key)
463
+ prompt = f"""Conference: {conference}
464
+ Decision: {decision}
465
+
466
+ Meta-review text:
467
+ {metareview_content[:4000]}"""
468
+ msg = client.messages.create(
469
+ model="claude-sonnet-4-6",
470
+ max_tokens=1400,
471
+ messages=[{"role": "user", "content": META_SYSTEM_PROMPT + "\n\n" + prompt}],
472
+ )
473
+ raw = msg.content[0].text
474
+ result = _parse_json(raw)
475
+ result["composite_score"] = round(
476
+ sum(META_WEIGHTS[k] * result.get("dimension_scores", {}).get(k, 3)
477
+ for k in META_WEIGHTS), 2
478
+ )
479
+ result["derived"] = compute_derived_metrics(result)
480
+ result["_raw_response"] = raw
481
+ return result
482
+
483
+
484
+ def format_metareview_result_markdown(result: dict) -> str:
485
+ label = result.get("main_label", "?")
486
+ icon = LABEL_COLORS.get(label, "⚫")
487
+ rqs = result.get("reasoning_quality_score", "?")
488
+ composite = result.get("composite_score", "?")
489
+ confidence = result.get("label_confidence", "?")
490
+ rationale = result.get("brief_rationale", "")
491
+ quote = result.get("most_diagnostic_quote", "")
492
+ bias_flags = result.get("bias_flags", [])
493
+ s1_signals = result.get("key_system1_signals", [])
494
+ s2_signals = result.get("key_system2_signals", [])
495
+ ds = result.get("dimension_scores", {})
496
+ derived = result.get("derived", {})
497
+
498
+ s1 = result.get("system1_score", "?")
499
+ s2 = result.get("system2_score", "?")
500
+ hd = derived.get("heuristic_dominance", "?")
501
+ ast = derived.get("analytic_strength", "?")
502
+ mix = derived.get("mixedness", "?")
503
+
504
+ filled = int(round((composite - 1) / 4 * 20)) if isinstance(composite, (int, float)) else 0
505
+ bar = "█" * filled + "░" * (20 - filled)
506
+
507
+ lines = [
508
+ "### Area Chair Meta-Review Analysis",
509
+ f"**Classification:** {icon} **{label}** · Confidence: {confidence}",
510
+ f"**Reasoning Quality Score:** {rqs} / 10",
511
+ f"**Dimension Composite:** `{bar}` {composite} / 5.00",
512
+ "",
513
+ "| Dimension | Score |",
514
+ "|-----------|------:|",
515
+ ]
516
+
517
+ for key, label_str in META_DIMENSION_LABELS.items():
518
+ score = ds.get(key, "?")
519
+ lines.append(f"| {label_str} | {score}/5 |")
520
+
521
+ lines += [
522
+ "",
523
+ "**Derived Metrics:**",
524
+ "| Metric | Value | Interpretation |",
525
+ "|--------|------:|----------------|",
526
+ f"| System 1 Score | {s1} | Degree of heuristic/intuitive reasoning (0–1) |",
527
+ f"| System 2 Score | {s2} | Degree of analytical/deliberate reasoning (0–1) |",
528
+ f"| Heuristic Dominance | {hd} | S1 − S2 · positive = more System 1 |",
529
+ f"| Analytic Strength | {ast} | RQS / 10 · overall reasoning quality |",
530
+ f"| Mixedness | {mix} | min(S1, S2) · how much both modes coexist |",
531
+ ]
532
+
533
+ if bias_flags:
534
+ lines += ["", "**Bias Flags:** " + " ".join(f"`{b}`" for b in bias_flags)]
535
+ if s2_signals:
536
+ lines += ["", "**System 2 signals:** " + " · ".join(f"*{s}*" for s in s2_signals)]
537
+ if s1_signals:
538
+ lines += ["", "**System 1 signals:** " + " · ".join(f"*{s}*" for s in s1_signals)]
539
+ if quote:
540
+ lines += ["", f"**Most diagnostic quote:** *\"{quote}\"*"]
541
+ if rationale:
542
+ lines += ["", f"**Rationale:** {rationale}"]
543
+
544
+ return "\n".join(lines)
545
+
546
+
547
  def format_result_markdown(reviewer_id: str, result: dict) -> str:
548
  label = result.get("main_label", "?")
549
  icon = LABEL_COLORS.get(label, "⚫")