nuocuhz Claude Sonnet 4.6 committed on
Commit
552a059
·
1 Parent(s): 6b4c90f

Wire meta-review tab to rate_metareview(), clean up duplicate code in app.py

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. app.py +45 -251
  3. rater.py +285 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -5,7 +5,10 @@ import json
5
  import datetime
6
  import gradio as gr
7
  from fetcher import fetch_paper_reviews, get_bundled_ids
8
- from rater import rate_review, format_result_markdown
 
 
 
9
 
10
  _paper_cache: dict = {}
11
  _last_result: dict = {} # stores last single-reviewer rating for feedback
@@ -184,6 +187,32 @@ def run_rating_all(paper_id: str, api_key: str):
184
  yield accumulated + "\n\n---\n\n*Done.*", gr.update(visible=False)
185
 
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  def submit_feedback(satisfaction: str, correct_label: str, comment: str):
188
  if not _last_result:
189
  return "No rating to give feedback on yet."
@@ -200,7 +229,7 @@ def submit_feedback(satisfaction: str, correct_label: str, comment: str):
200
  }
201
  with open(FEEDBACK_FILE, "a") as f:
202
  f.write(json.dumps(entry, ensure_ascii=False) + "\n")
203
- return f"✅ Feedback saved. Thank you!"
204
 
205
 
206
  # ── UI ──────────────────────────────────────────────────────────────────────────
@@ -296,9 +325,17 @@ This perspective reframes peer review as a **reasoning process** rather than mer
296
 
297
  with gr.Tab("Meta-Review"):
298
  meta_display = gr.Markdown("*Load a paper to see the meta-review.*")
 
 
 
 
299
 
300
  # ── Wire events ────────────────────────────────────────────────────────────
301
- load_btn.click(load_paper, [paper_id_box], [reviewer_dd, paper_info, meta_display, result_display])
 
 
 
 
302
  reviewer_dd.change(show_review, [paper_id_box, reviewer_dd], [review_display])
303
 
304
  rate_one_btn.click(
@@ -311,6 +348,11 @@ This perspective reframes peer review as a **reasoning process** rather than mer
311
  [paper_id_box, api_key_box],
312
  [result_display, feedback_panel],
313
  )
 
 
 
 
 
314
  submit_fb_btn.click(
315
  submit_feedback,
316
  [satisfaction, correct_label, comment],
@@ -320,251 +362,3 @@ This perspective reframes peer review as a **reasoning process** rather than mer
320
 
321
  if __name__ == "__main__":
322
  demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
323
-
324
-
325
- _paper_cache: dict = {}
326
-
327
- # ── Section content ────────────────────────────────────────────────────────────
328
-
329
- SECTION_CONTENT = {
330
- "📖 Motivation": """### Motivation
331
-
332
- Peer review is one of the central institutions governing scientific progress, yet most existing analysis focuses on outcomes such as scores, acceptance rates, disagreement levels, or textual sentiment. These signals are useful but incomplete. They do not directly capture **how reviewers think**.
333
-
334
- Kahneman's dual-process framework provides a principled theoretical lens:
335
-
336
- - **System 1** is rapid, associative, intuitive, and often relies on heuristics such as representativeness, familiarity, fluency, and global impressions.
337
- - **System 2** is effortful, analytical, explicit, and more likely to engage in structured reasoning, evidence integration, and conditional judgment.
338
-
339
- Applied to peer review, this distinction enables us to study whether a review is dominated by venue-fit heuristics, abstract "overall impression" judgments, or conclusion-first reasoning — or instead by falsifiable claims, methodological decomposition, comparative evidence, and belief updating.
340
-
341
- This is not merely a stylistic distinction. It bears directly on questions of **review quality**, **rebuttal responsiveness**, **decision transparency**, and **cognitive bias in evaluation**.""",
342
-
343
- "🎯 Core Objectives": """### Core Objectives
344
-
345
- The goal of Kahneman4Review is to build a robust framework for:
346
-
347
- 1. **Classifying** review text into cognitive reasoning modes inspired by Kahneman's theory;
348
- 2. **Characterizing** the effort structure of review reasoning, from low-effort impressionistic judgment to high-effort analytical synthesis;
349
- 3. **Diagnosing** cognitive biases in review and metareview, such as representativeness heuristics, question substitution, anchoring, confirmation bias, overconfidence, and narrative fallacy;
350
- 4. **Supporting** LLM-based judges that can assess the reasoning mode and epistemic quality of reviews in a structured, reproducible way.""",
351
-
352
- "📐 Academic Claim": """### Academic Claim
353
-
354
- The central academic claim is that **review quality cannot be fully understood without reasoning structure**. A review may be long, harsh, polite, or even technically correct, yet still be cognitively shallow. Conversely, a review may be negative but high-quality if it exhibits strong System 2 properties such as precise falsifiability, explicit evidence chains, and principled updating under rebuttal.
355
-
356
- This project sits at the intersection of:
357
- - **Metascience**: understanding the scientific process itself;
358
- - **AI for Science / AI for Institutions**: using language models to analyze scientific governance mechanisms;
359
- - **Computational social science**: studying evaluation behavior through text;
360
- - **LLM-as-a-Judge research**: moving beyond outcome scoring toward reasoning-aware judgment;
361
- - **Cognitive science of decision-making**: operationalizing dual-process theory in institutional text.""",
362
-
363
- "🔑 Key Contributions": """### Key Contributions
364
-
365
- **1. A cognitive taxonomy for peer review**
366
- We operationalize Kahneman's theory into an annotation framework suitable for review text, including System 1, System 2, mixed / transitional reasoning, and non-evaluative administrative language.
367
-
368
- **2. Effort-sensitive reasoning analysis**
369
- Beyond binary labels, the framework distinguishes different levels of System 2 effort, separating shallow structured criticism from deeper falsification-oriented reasoning and meta-level synthesis.
370
-
371
- **3. Bias diagnostics for review interpretation**
372
- The framework explicitly identifies recurring bias pathways: venue-fit substitution, authority alignment, conclusion-first justification, selective evidence weighting, and failure to update after rebuttal.""",
373
-
374
- "💡 Why This Matters": """### Why This Matters
375
-
376
- The significance of this project is not limited to review analytics. More broadly, it addresses a foundational problem in the evaluation of human and AI reasoning:
377
-
378
- > *How can we distinguish genuine analysis from articulate intuition?*
379
-
380
- In academic review, this distinction affects fairness, transparency, and the reliability of scientific gatekeeping. In LLM evaluation, it affects whether models merely mimic analytical language or actually detect structured reasoning.
381
-
382
- By making the cognitive mode of review explicit, Kahneman4Review aims to support better review auditing, more interpretable LLM judges, stronger rebuttal strategies, and more scientifically grounded discussion of what constitutes a "good review." """,
383
- }
384
-
385
- SECTION_LABELS = list(SECTION_CONTENT.keys())
386
-
387
-
388
- # ── Callbacks ──────────────────────────────────────────────────────────────────
389
-
390
- def _get_api_key(user_key: str) -> str:
391
- k = (user_key or "").strip()
392
- return k or os.environ.get("ANTHROPIC_API_KEY", "")
393
-
394
-
395
- def toggle_section(label, current_label):
396
- """Toggle section: if same button clicked again, collapse."""
397
- if label == current_label:
398
- return "", gr.update(visible=False), ""
399
- return SECTION_CONTENT.get(label, ""), gr.update(visible=True), label
400
-
401
-
402
- def load_paper(paper_id: str):
403
- paper_id = (paper_id or "").strip()
404
- if not paper_id:
405
- return gr.update(choices=[], value=None), "Please enter a paper ID.", "", ""
406
- try:
407
- paper = fetch_paper_reviews(paper_id)
408
- _paper_cache[paper_id] = paper
409
- reviewers = [r["reviewer_id"] for r in paper["reviews"]]
410
- decision = paper.get("decision", "")
411
- info = f"**{paper.get('title', paper_id)}**\n\n{paper.get('conference', '')}"
412
- if decision:
413
- info += f" · **Decision:** {decision}"
414
- info += f" · {len(reviewers)} reviewer(s)"
415
- metareview = paper.get("metareview", "")
416
- meta_md = f"**Area Chair Meta-Review:**\n\n{metareview}" if metareview else "*No meta-review available.*"
417
- return gr.update(choices=reviewers, value=reviewers[0] if reviewers else None), info, meta_md, ""
418
- except Exception as e:
419
- return gr.update(choices=[], value=None), f"Error: {e}", "", ""
420
-
421
-
422
- def show_review(paper_id: str, reviewer_id: str):
423
- paper = _paper_cache.get((paper_id or "").strip())
424
- if not paper or not reviewer_id:
425
- return ""
426
- for r in paper["reviews"]:
427
- if r["reviewer_id"] == reviewer_id:
428
- return f"**Initial:** {r['initial_rating']} **Final:** {r['final_rating']}\n\n{r['review_content']}"
429
- return ""
430
-
431
-
432
- def run_rating(paper_id: str, reviewer_id: str, api_key: str):
433
- paper = _paper_cache.get((paper_id or "").strip())
434
- if not paper:
435
- yield "Please load a paper first."
436
- return
437
- if not reviewer_id:
438
- yield "Please select a reviewer."
439
- return
440
- key = _get_api_key(api_key)
441
- if not key:
442
- yield "No API key found. Enter your Anthropic API key above."
443
- return
444
- review = next((r for r in paper["reviews"] if r["reviewer_id"] == reviewer_id), None)
445
- if not review:
446
- yield f"Reviewer {reviewer_id} not found."
447
- return
448
- yield f"Calling Claude to rate **{reviewer_id}**…"
449
- try:
450
- result = rate_review(
451
- review_content=review["review_content"],
452
- initial_rating=review["initial_rating"],
453
- final_rating=review["final_rating"],
454
- conference=paper.get("conference", ""),
455
- api_key=key,
456
- )
457
- yield format_result_markdown(reviewer_id, result)
458
- except Exception as e:
459
- yield f"Error: {e}"
460
-
461
-
462
- def run_rating_all(paper_id: str, api_key: str):
463
- paper = _paper_cache.get((paper_id or "").strip())
464
- if not paper:
465
- yield "Please load a paper first."
466
- return
467
- key = _get_api_key(api_key)
468
- if not key:
469
- yield "No API key found. Enter your Anthropic API key above."
470
- return
471
- accumulated = ""
472
- for i, review in enumerate(paper["reviews"]):
473
- rid = review["reviewer_id"]
474
- marker = f"\n\n---\n\n*Rating {i+1}/{len(paper['reviews'])}: {rid}…*"
475
- accumulated += marker
476
- yield accumulated
477
- try:
478
- result = rate_review(
479
- review_content=review["review_content"],
480
- initial_rating=review["initial_rating"],
481
- final_rating=review["final_rating"],
482
- conference=paper.get("conference", ""),
483
- api_key=key,
484
- )
485
- accumulated = accumulated[: -len(marker)]
486
- accumulated += "\n\n---\n\n" + format_result_markdown(rid, result)
487
- except Exception as e:
488
- accumulated = accumulated[: -len(marker)]
489
- accumulated += f"\n\n---\n\n**{rid}** — Error: {e}"
490
- yield accumulated
491
- yield accumulated + "\n\n---\n\n*Done.*"
492
-
493
-
494
- # ── UI ─────────────────────────────────────────────────────────────────────────
495
-
496
- with gr.Blocks(title="Kahneman4Review", theme=gr.themes.Soft()) as demo:
497
-
498
- gr.Markdown("""# 🧠 Kahneman4Review
499
-
500
- Kahneman4Review is a research-oriented framework for analyzing the cognitive structure of academic peer review through the lens of Daniel Kahneman's dual-process theory in *Thinking, Fast and Slow*. The project studies whether review statements are primarily driven by **System 1** reasoning (fast, intuitive, impression-based judgment) or by **System 2** reasoning (slow, deliberate, evidence-based analysis).
501
-
502
- Rather than treating reviews only as scalar signals of acceptance or rejection, this project asks a deeper scientific question:
503
-
504
- > *What kinds of cognition are reflected in peer review text, and how do those cognitive modes shape review quality, fairness, and decision reliability?*
505
-
506
- This perspective reframes peer review as a **reasoning process** rather than merely an evaluative outcome.
507
- """)
508
-
509
- # ── Shared expandable section ──────────────────────────────────────────────
510
- _current_section = gr.State("")
511
-
512
- with gr.Row():
513
- sec_btns = [
514
- gr.Button(label, size="sm", variant="secondary")
515
- for label in SECTION_LABELS
516
- ]
517
-
518
- section_box = gr.Markdown("", visible=False)
519
-
520
- for btn in sec_btns:
521
- btn.click(
522
- fn=toggle_section,
523
- inputs=[btn, _current_section],
524
- outputs=[section_box, section_box, _current_section],
525
- )
526
-
527
- gr.Markdown("""---
528
- > *"A review should be judged not only by what it concludes, but by how it reaches that conclusion."*
529
-
530
- ---""")
531
-
532
- # ── Paper loader ───────────────────────────────────────────────────────────
533
- api_key_box = gr.Textbox(
534
- label="Anthropic API Key (leave blank to use server key)",
535
- placeholder="sk-ant-...",
536
- type="password",
537
- )
538
-
539
- with gr.Row():
540
- paper_id_box = gr.Textbox(
541
- label="OpenReview Paper ID",
542
- placeholder="e.g. B1e3OlStPB",
543
- scale=3,
544
- )
545
- load_btn = gr.Button("Load Paper", variant="primary", scale=1)
546
-
547
- paper_info = gr.Markdown("")
548
-
549
- with gr.Tabs():
550
- with gr.Tab("Reviews"):
551
- with gr.Row():
552
- reviewer_dd = gr.Dropdown(choices=[], label="Select Reviewer", interactive=True, scale=2)
553
- rate_one_btn = gr.Button("AI Rate This Reviewer", variant="primary", scale=1)
554
- rate_all_btn = gr.Button("AI Rate All Reviewers", variant="secondary", scale=1)
555
- review_display = gr.Markdown("")
556
- gr.Markdown("---")
557
- gr.Markdown("### Rating Results")
558
- result_display = gr.Markdown("")
559
-
560
- with gr.Tab("Meta-Review"):
561
- meta_display = gr.Markdown("*Load a paper to see the meta-review.*")
562
-
563
- load_btn.click(load_paper, [paper_id_box], [reviewer_dd, paper_info, meta_display, result_display])
564
- reviewer_dd.change(show_review, [paper_id_box, reviewer_dd], [review_display])
565
- rate_one_btn.click(run_rating, [paper_id_box, reviewer_dd, api_key_box], [result_display])
566
- rate_all_btn.click(run_rating_all, [paper_id_box, api_key_box], [result_display])
567
-
568
-
569
- if __name__ == "__main__":
570
- demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
 
5
  import datetime
6
  import gradio as gr
7
  from fetcher import fetch_paper_reviews, get_bundled_ids
8
+ from rater import (
9
+ rate_review, format_result_markdown,
10
+ rate_metareview, format_metareview_result_markdown,
11
+ )
12
 
13
  _paper_cache: dict = {}
14
  _last_result: dict = {} # stores last single-reviewer rating for feedback
 
187
  yield accumulated + "\n\n---\n\n*Done.*", gr.update(visible=False)
188
 
189
 
190
+ def run_metareview_rating(paper_id: str, api_key: str):
191
+ paper = _paper_cache.get((paper_id or "").strip())
192
+ if not paper:
193
+ yield "Please load a paper first."
194
+ return
195
+ metareview = paper.get("metareview", "").strip()
196
+ if not metareview:
197
+ yield "No meta-review available for this paper."
198
+ return
199
+ key = _get_api_key(api_key)
200
+ if not key:
201
+ yield "No API key found. Enter your Anthropic API key above."
202
+ return
203
+ yield "Calling Claude to rate the meta-review…"
204
+ try:
205
+ result = rate_metareview(
206
+ metareview_content=metareview,
207
+ decision=paper.get("decision", ""),
208
+ conference=paper.get("conference", ""),
209
+ api_key=key,
210
+ )
211
+ yield format_metareview_result_markdown(result)
212
+ except Exception as e:
213
+ yield f"Error: {e}"
214
+
215
+
216
  def submit_feedback(satisfaction: str, correct_label: str, comment: str):
217
  if not _last_result:
218
  return "No rating to give feedback on yet."
 
229
  }
230
  with open(FEEDBACK_FILE, "a") as f:
231
  f.write(json.dumps(entry, ensure_ascii=False) + "\n")
232
+ return "✅ Feedback saved. Thank you!"
233
 
234
 
235
  # ── UI ──────────────────────────────────────────────────────────────────────────
 
325
 
326
  with gr.Tab("Meta-Review"):
327
  meta_display = gr.Markdown("*Load a paper to see the meta-review.*")
328
+ gr.Markdown("---")
329
+ rate_meta_btn = gr.Button("AI Rate Meta-Review", variant="primary")
330
+ gr.Markdown("### Meta-Review Analysis")
331
+ meta_result_display = gr.Markdown("")
332
 
333
  # ── Wire events ────────────────────────────────────────────────────────────
334
+ load_btn.click(
335
+ load_paper,
336
+ [paper_id_box],
337
+ [reviewer_dd, paper_info, meta_display, result_display],
338
+ )
339
  reviewer_dd.change(show_review, [paper_id_box, reviewer_dd], [review_display])
340
 
341
  rate_one_btn.click(
 
348
  [paper_id_box, api_key_box],
349
  [result_display, feedback_panel],
350
  )
351
+ rate_meta_btn.click(
352
+ run_metareview_rating,
353
+ [paper_id_box, api_key_box],
354
+ [meta_result_display],
355
+ )
356
  submit_fb_btn.click(
357
  submit_feedback,
358
  [satisfaction, correct_label, comment],
 
362
 
363
  if __name__ == "__main__":
364
  demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
rater.py CHANGED
@@ -259,6 +259,291 @@ def rate_review(review_content: str, initial_rating: str, final_rating: str,
259
  return result
260
 
261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  def format_result_markdown(reviewer_id: str, result: dict) -> str:
263
  label = result.get("main_label", "?")
264
  icon = LABEL_COLORS.get(label, "⚫")
 
259
  return result
260
 
261
 
262
+ # ══════════════════════════════════════════════════════════════════════════════
263
+ # META-REVIEW PROMPT
264
+ # ══════════════════════════════════════════════════════════════════════════════
265
+
266
+ META_SYSTEM_PROMPT = """You are an expert evaluator of reasoning in academic meta-review.
267
+
268
+ Your task is to classify a given meta-review passage according to Daniel Kahneman's dual-process framework from *Thinking, Fast and Slow*, adapted to the context of academic review aggregation and decision-making.
269
+
270
+ A meta-review is NOT the same as an ordinary review.
271
+ In addition to object-level reasoning, a meta-review may perform higher-order reasoning by:
272
+ - aggregating reviewer evidence,
273
+ - handling disagreement,
274
+ - recording rebuttal-driven updates,
275
+ - synthesizing concerns into a final decision,
276
+ - and making an institutional recommendation traceable to prior evidence.
277
+
278
+ You are NOT judging whether the final decision is correct.
279
+ You are judging what kind of cognitive process the meta-review reflects, how much analytical effort it shows, how well it aggregates evidence, and how transparent its final decision is.
280
+
281
+ ==================================================
282
+ SECTION 1: LABEL SPACE
283
+ ==================================================
284
+
285
+ Assign exactly one main label:
286
+
287
+ 1. System 1
288
+ 2. System 2
289
+ 3. Mixed
290
+ 4. Non-evaluative
291
+
292
+ Definitions:
293
+
294
+ System 1:
295
+ The meta-review is primarily intuitive, compressed, shortcut-driven, authority-dependent, venue-fit driven, or conclusion-first.
296
+ It may summarize concerns vaguely, rely on global impressions, defer to others without real synthesis, or make a final decision that is weakly traceable to evidence.
297
+
298
+ System 2:
299
+ The meta-review performs explicit, careful, evidence-based synthesis.
300
+ It accurately aggregates reviewer concerns, distinguishes resolved vs unresolved issues, handles disagreement explicitly, reflects rebuttal-driven updates, and makes the final decision traceable to the preceding evidence.
301
+
302
+ Mixed:
303
+ The meta-review contains substantial signals of both System 1 and System 2.
304
+ For example, it may contain real synthesis but still end with a compressed venue-fit judgment, or summarize reviewer evidence carefully but ignore disagreement.
305
+
306
+ Non-evaluative:
307
+ The meta-review contains little or no meaningful evaluative reasoning.
308
+ Examples include purely administrative text, placeholder content, or a decision with no supporting reasoning.
309
+
310
+ ==================================================
311
+ SECTION 2: CORE JUDGING PRINCIPLES
312
+ ==================================================
313
+
314
+ 1. Do NOT judge based on decision correctness.
315
+ A meta-review that recommends rejection can still be System 2 if its reasoning is explicit and traceable.
316
+
317
+ 2. Do NOT judge based on length.
318
+ A short meta-review can be System 2 if it is precise and well-grounded.
319
+ A long meta-review can be System 1 if it is repetitive or impressionistic.
320
+
321
+ 3. DO judge based on aggregation quality.
322
+ Does the meta-reviewer accurately represent what the reviewers said?
323
+ Does it distinguish between reviewers who were convinced by the rebuttal and those who were not?
324
+
325
+ 4. DO judge based on decision traceability.
326
+ Can the final recommendation be traced back to specific evidence or reasoning in the meta-review?
327
+ Or does it appear as a conclusion without derivation?
328
+
329
+ 5. DO judge based on disagreement handling.
330
+ When reviewers disagree, does the meta-reviewer engage with the disagreement explicitly?
331
+ Or does it average, ignore, or defer to the majority without reasoning?
332
+
333
+ 6. DO judge based on rebuttal integration.
334
+ Does the meta-review reflect what changed (or did not change) after the rebuttal?
335
+ Or does it treat the rebuttal as irrelevant?
336
+
337
+ ==================================================
338
+ SECTION 3: DIMENSION SCORES
339
+ ==================================================
340
+
341
+ Score the meta-review on 6 dimensions (1–5 each):
342
+
343
+ D1. Aggregation Accuracy
344
+ Does the meta-reviewer accurately represent the reviewers' concerns?
345
+ 1 = Misrepresents or ignores reviewer content
346
+ 3 = Partially accurate, some omissions or distortions
347
+ 5 = Accurate and complete representation of reviewer positions
348
+
349
+ D2. Disagreement Handling
350
+ When reviewers disagree, does the meta-reviewer engage with the disagreement?
351
+ 1 = Ignores disagreement or averages without reasoning
352
+ 3 = Acknowledges disagreement but does not resolve it
353
+ 5 = Explicitly engages with disagreement and explains how it was resolved
354
+
355
+ D3. Rebuttal Integration
356
+ Does the meta-review reflect what changed after the rebuttal?
357
+ 1 = No mention of rebuttal or its effects
358
+ 3 = Mentions rebuttal but does not specify what changed
359
+ 5 = Explicitly states which concerns were resolved and which remain
360
+
361
+ D4. Decision Traceability
362
+ Can the final recommendation be traced to specific evidence?
363
+ 1 = Decision appears without derivation
364
+ 3 = Some connection between evidence and decision
365
+ 5 = Decision is fully traceable to specific prior reasoning
366
+
367
+ D5. Reasoning Explicitness
368
+ Does the meta-reviewer show their reasoning, or only state conclusions?
369
+ 1 = Pure assertion
370
+ 3 = Some reasoning present but incomplete
371
+ 5 = Fully explicit reasoning chains
372
+
373
+ D6. Synthesis Quality
374
+ Does the meta-review go beyond summarizing individual reviews to produce a coherent synthesis?
375
+ 1 = No synthesis, just a list of reviewer opinions
376
+ 3 = Some synthesis present
377
+ 5 = Coherent synthesis that integrates multiple perspectives into a unified assessment
378
+
379
+ ==================================================
380
+ SECTION 4: BIAS FLAGS
381
+ ==================================================
382
+
383
+ Flag any of the following if clearly evidenced:
384
+
385
+ - MAJORITY_DEFERENCE: Decision follows reviewer majority without independent reasoning
386
+ - AUTHORITY_DEFERENCE: Defers to a senior reviewer or author reputation without analysis
387
+ - VENUE_FIT: Decision based on fit to venue rather than paper quality
388
+ - REBUTTAL_DISMISSAL: Rebuttal ignored or dismissed without engagement
389
+ - AGGREGATION_COMPRESSION: Reviewer concerns compressed into vague summary losing specificity
390
+ - CONCLUSION_FIRST: Final decision stated before or without supporting reasoning
391
+ - SELECTIVE_SYNTHESIS: Only engages with evidence supporting the final decision
392
+ - OVERCONFIDENCE: Certainty expressed beyond what the evidence supports
393
+
394
+ ==================================================
395
+ SECTION 5: REASONING QUALITY SCORE
396
+ ==================================================
397
+
398
+ Assign a single overall Reasoning Quality Score from 1 to 10.
399
+
400
+ 1–2: No meaningful reasoning. Pure assertion or administrative text.
401
+ 3–4: Minimal reasoning. Mostly impressionistic with occasional specifics.
402
+ 5–6: Moderate reasoning. Some structured analysis but significant gaps.
403
+ 7–8: Strong reasoning. Mostly explicit, grounded, and well-aggregated.
404
+ 9–10: Exceptional reasoning. Fully explicit, traceable, and systematically structured.
405
+
406
+ ==================================================
407
+ OUTPUT FORMAT
408
+ ==================================================
409
+
410
+ Respond with ONLY a valid JSON object. No markdown fences, no explanation outside the JSON.
411
+
412
+ {
413
+ "main_label": "<System 1 | System 2 | Mixed | Non-evaluative>",
414
+ "label_confidence": "<high | medium | low>",
415
+ "system1_score": <0.0-1.0>,
416
+ "system2_score": <0.0-1.0>,
417
+ "reasoning_quality_score": <1-10>,
418
+ "dimension_scores": {
419
+ "aggregation_accuracy": <1-5>,
420
+ "disagreement_handling": <1-5>,
421
+ "rebuttal_integration": <1-5>,
422
+ "decision_traceability": <1-5>,
423
+ "reasoning_explicitness": <1-5>,
424
+ "synthesis_quality": <1-5>
425
+ },
426
+ "bias_flags": ["<BIAS_NAME>", ...],
427
+ "key_system1_signals": ["<signal>", ...],
428
+ "key_system2_signals": ["<signal>", ...],
429
+ "most_diagnostic_quote": "<exact quote from meta-review>",
430
+ "brief_rationale": "<2-3 sentences explaining the classification>"
431
+ }
432
+
433
+ Notes on system1_score and system2_score:
434
+ - Both are continuous values in [0.0, 1.0].
435
+ - They do NOT need to sum to 1.0.
436
+ - System 1 dominant: system1_score > 0.7, system2_score < 0.3
437
+ - System 2 dominant: system2_score > 0.7, system1_score < 0.3
438
+ - Mixed: both moderate (0.3–0.7)
439
+ - Non-evaluative: both low (<0.3)"""
440
+
441
+ META_DIMENSION_LABELS = {
442
+ "aggregation_accuracy": "Aggregation Accuracy",
443
+ "disagreement_handling": "Disagreement Handling",
444
+ "rebuttal_integration": "Rebuttal Integration",
445
+ "decision_traceability": "Decision Traceability",
446
+ "reasoning_explicitness": "Reasoning Explicitness",
447
+ "synthesis_quality": "Synthesis Quality",
448
+ }
449
+
450
+ META_WEIGHTS = {
451
+ "aggregation_accuracy": 0.20,
452
+ "disagreement_handling": 0.15,
453
+ "rebuttal_integration": 0.15,
454
+ "decision_traceability": 0.20,
455
+ "reasoning_explicitness": 0.15,
456
+ "synthesis_quality": 0.15,
457
+ }
458
+
459
+
460
+ def rate_metareview(metareview_content: str, decision: str,
461
+ conference: str, api_key: str) -> dict:
462
+ client = anthropic.Anthropic(api_key=api_key)
463
+ prompt = f"""Conference: {conference}
464
+ Decision: {decision}
465
+
466
+ Meta-review text:
467
+ {metareview_content[:4000]}"""
468
+ msg = client.messages.create(
469
+ model="claude-sonnet-4-6",
470
+ max_tokens=1400,
471
+ messages=[{"role": "user", "content": META_SYSTEM_PROMPT + "\n\n" + prompt}],
472
+ )
473
+ raw = msg.content[0].text
474
+ result = _parse_json(raw)
475
+ result["composite_score"] = round(
476
+ sum(META_WEIGHTS[k] * result.get("dimension_scores", {}).get(k, 3)
477
+ for k in META_WEIGHTS), 2
478
+ )
479
+ result["derived"] = compute_derived_metrics(result)
480
+ result["_raw_response"] = raw
481
+ return result
482
+
483
+
484
+ def format_metareview_result_markdown(result: dict) -> str:
485
+ label = result.get("main_label", "?")
486
+ icon = LABEL_COLORS.get(label, "⚫")
487
+ rqs = result.get("reasoning_quality_score", "?")
488
+ composite = result.get("composite_score", "?")
489
+ confidence = result.get("label_confidence", "?")
490
+ rationale = result.get("brief_rationale", "")
491
+ quote = result.get("most_diagnostic_quote", "")
492
+ bias_flags = result.get("bias_flags", [])
493
+ s1_signals = result.get("key_system1_signals", [])
494
+ s2_signals = result.get("key_system2_signals", [])
495
+ ds = result.get("dimension_scores", {})
496
+ derived = result.get("derived", {})
497
+
498
+ s1 = result.get("system1_score", "?")
499
+ s2 = result.get("system2_score", "?")
500
+ hd = derived.get("heuristic_dominance", "?")
501
+ ast = derived.get("analytic_strength", "?")
502
+ mix = derived.get("mixedness", "?")
503
+
504
+ filled = int(round((composite - 1) / 4 * 20)) if isinstance(composite, (int, float)) else 0
505
+ bar = "█" * filled + "░" * (20 - filled)
506
+
507
+ lines = [
508
+ "### Area Chair Meta-Review Analysis",
509
+ f"**Classification:** {icon} **{label}** · Confidence: {confidence}",
510
+ f"**Reasoning Quality Score:** {rqs} / 10",
511
+ f"**Dimension Composite:** `{bar}` {composite} / 5.00",
512
+ "",
513
+ "| Dimension | Score |",
514
+ "|-----------|------:|",
515
+ ]
516
+
517
+ for key, label_str in META_DIMENSION_LABELS.items():
518
+ score = ds.get(key, "?")
519
+ lines.append(f"| {label_str} | {score}/5 |")
520
+
521
+ lines += [
522
+ "",
523
+ "**Derived Metrics:**",
524
+ "| Metric | Value | Interpretation |",
525
+ "|--------|------:|----------------|",
526
+ f"| System 1 Score | {s1} | Degree of heuristic/intuitive reasoning (0–1) |",
527
+ f"| System 2 Score | {s2} | Degree of analytical/deliberate reasoning (0–1) |",
528
+ f"| Heuristic Dominance | {hd} | S1 − S2 · positive = more System 1 |",
529
+ f"| Analytic Strength | {ast} | RQS / 10 · overall reasoning quality |",
530
+ f"| Mixedness | {mix} | min(S1, S2) · how much both modes coexist |",
531
+ ]
532
+
533
+ if bias_flags:
534
+ lines += ["", "**Bias Flags:** " + " ".join(f"`{b}`" for b in bias_flags)]
535
+ if s2_signals:
536
+ lines += ["", "**System 2 signals:** " + " · ".join(f"*{s}*" for s in s2_signals)]
537
+ if s1_signals:
538
+ lines += ["", "**System 1 signals:** " + " · ".join(f"*{s}*" for s in s1_signals)]
539
+ if quote:
540
+ lines += ["", f"**Most diagnostic quote:** *\"{quote}\"*"]
541
+ if rationale:
542
+ lines += ["", f"**Rationale:** {rationale}"]
543
+
544
+ return "\n".join(lines)
545
+
546
+
547
  def format_result_markdown(reviewer_id: str, result: dict) -> str:
548
  label = result.get("main_label", "?")
549
  icon = LABEL_COLORS.get(label, "⚫")