GRM / benchmarks.py
mbagdasarova-nvidia's picture
Add GRM evaluation suite with scoring logic and benchmark registry
2a44234
raw
history blame
18.1 kB
"""
GRM Evaluation Suite — Benchmark Registry
Each benchmark is defined with:
- name: Display name
- category: ROLEPLAY | ACTIONS | GENERAL
- description: Short description of what the benchmark tests
- calc_weight: 1.0 (core) or 0.5 (supplementary)
- summary: Multi-sentence methodology summary for display
- paper: URL to the paper or resource, or None when no reference is listed
"""
# Registry of every benchmark in the suite, stored as a flat list of dicts.
# Every entry carries the same six keys: "name", "category", "calc_weight",
# "description", "summary" and "paper" ("paper" is None where no link is given).
# Entries are grouped by category; within each group the calc_weight 1.0 (core)
# entries come first, followed by the calc_weight 0.5 (supplementary) entries.
# In the string literals below, "\u2014" is an em dash and "\u03c4\u00b2" is "τ²".
BENCHMARKS = [
# ── ROLEPLAY (33%) ──────────────────────────────────────────────
# Core roleplay benchmarks (calc_weight 1.0)
{
"name": "MultiChallenge",
"category": "ROLEPLAY",
"calc_weight": 1.0,
"description": "Multi-turn instruction following & coherence across turns",
"summary": (
"MultiChallenge evaluates multi-turn instruction following where models must "
"maintain coherence across multiple challenging conversational turns. Each test "
"scenario involves complex, multi-constraint instructions that require the model "
"to track context, resolve references, and keep all prior commitments intact while "
"handling new user requests."
),
"paper": None,
},
{
"name": "RoleBench",
"category": "ROLEPLAY",
"calc_weight": 1.0,
"description": "Character-level roleplaying with explicit role framing",
"summary": (
"RoleBench is the first systematic and fine-grained character-level benchmark "
"for role-playing, comprising 168,093 samples covering 100 distinct roles. "
"Created via the RoleLLM framework using Context-Instruct for role-specific "
"knowledge extraction and RoleGPT for speaking style imitation. Evaluates a "
"model's ability to maintain character persona, domain knowledge, and consistent "
"speaking style throughout interactions."
),
"paper": "https://arxiv.org/abs/2310.00746",
},
{
"name": "GRM-Bench \u2014 Coherence",
"category": "ROLEPLAY",
"calc_weight": 1.0,
"description": "Logically sound and coherent across turns, without contradictions",
"summary": (
"Nvidia-authored benchmark testing resistance to incoherence in gaming dialogue. "
"Scenarios are crafted to invoke common coherence failures, then a model's "
"resilience is measured. Detection covers eight categories: factual/logical errors, "
"cause-effect failures, self-contradiction, personality/background violations, "
"role confusion, irrelevance, knowledge boundary violations, and false premise "
"acceptance."
),
"paper": None,
},
{
"name": "GRM-Bench \u2014 Response Diversity",
"category": "ROLEPLAY",
"calc_weight": 1.0,
"description": "Avoids using repetitive language and speech structure",
"summary": (
"Nvidia-authored benchmark measuring whether models avoid repetitive language "
"patterns, vocabulary, and sentence structures across varied dialogue exchanges. "
"Evaluates lexical diversity, syntactic variation, and stylistic range when a "
"character is placed in different conversation contexts."
),
"paper": None,
},
{
# NOTE(review): "Adaption" may be intended as "Adaptation"; left unchanged
# because the name is a runtime string that callers may match on — confirm.
"name": "GRM-Bench \u2014 Context Adaption",
"category": "ROLEPLAY",
"calc_weight": 1.0,
"description": "Using latest knowledge/variable updates even if changed during convo",
"summary": (
"Nvidia-authored benchmark testing whether models correctly incorporate the latest "
"game-state and knowledge updates, even when facts change mid-conversation. "
"Scenarios involve dynamic variable mutations (e.g. inventory changes, NPC status "
"updates) and verify the model references the current state rather than stale data."
),
"paper": None,
},
# Supplementary roleplay benchmarks (calc_weight 0.5)
{
"name": "DialogueNLI",
"category": "ROLEPLAY",
"calc_weight": 0.5,
"description": "Checks contradiction/consistency crumbling",
"summary": (
"Dialogue Natural Language Inference dataset for evaluating consistency in "
"dialogue agents. Uses NLI-style classification (entailment / contradiction / "
"neutral) to detect when a dialogue agent contradicts its established persona "
"or previous statements. Derived from the Persona-Chat dataset with human-"
"annotated sentence pairs."
),
"paper": "https://arxiv.org/abs/1811.00671",
},
{
"name": "RoleMRC",
"category": "ROLEPLAY",
"calc_weight": 0.5,
"description": "Follow complex nested instructions while remaining in character",
"summary": (
"Role-based Machine Reading Comprehension benchmark that tests the ability to "
"follow complex, nested instructions while remaining fully in character. Combines "
"reading comprehension challenges with role-playing constraints, requiring models "
"to extract and reason about information without breaking persona."
),
"paper": None,
},
{
"name": "EQBench v3",
"category": "ROLEPLAY",
"calc_weight": 0.5,
"description": "Detect nuances in tone/intent and modulate response accordingly",
"summary": (
"Emotional Quotient Benchmark v3 assesses a model's emotional intelligence — "
"specifically the ability to detect nuances in tone, intent, and emotional subtext, "
"and to modulate responses accordingly. Tests include recognizing sarcasm, empathy "
"calibration, emotional escalation/de-escalation, and context-appropriate tonal "
"shifts."
),
"paper": "https://eqbench.com/",
},
# ── ACTIONS (33%) ───────────────────────────────────────────────
# Core action / tool-calling benchmarks (calc_weight 1.0)
{
"name": "BFCLv3",
"category": "ACTIONS",
"calc_weight": 1.0,
"description": "Serial/parallel tool calling, multi-step settings",
"summary": (
"Berkeley Function-Calling Leaderboard v3 evaluates serial and parallel tool "
"calling in multi-step settings across multiple programming languages and complex "
"function schemas. Tests include simple, multiple, parallel, and nested function "
"calls, as well as function relevance detection (knowing when no tool applies)."
),
# NOTE(review): the URL slug carries no version marker — confirm it is the v3 write-up.
"paper": "https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html",
},
{
"name": "Tau2-Bench",
"category": "ACTIONS",
"calc_weight": 1.0,
"description": "Multi-turn interactions w/ real-world commercial operations",
"summary": (
"\u03c4\u00b2-Bench from Sierra Research is a multi-turn agentic benchmark using "
"dual-control agent-user simulation for testing tool use in real-world commercial "
"operations. The telecom domain contains 114 programmatically generated tasks "
"with varying intents (service, mobile data, MMS). The outcome world-state "
"determines success — e.g. whether Data is functioning after agent completion."
),
"paper": "https://arxiv.org/abs/2506.07982",
},
{
"name": "ToolSandbox",
"category": "ACTIONS",
"calc_weight": 1.0,
"description": "Stateful dependencies + conversational tool calling",
"summary": (
"Apple's stateful, conversational, interactive evaluation benchmark for LLM tool "
"use. Includes stateful tool execution with implicit state dependencies between "
"tools, a built-in user simulator supporting on-policy conversational evaluation, "
"and dynamic evaluation of intermediate and final milestones. Tests canonicalization, "
"insufficient information handling, and complex state management."
),
"paper": "https://arxiv.org/abs/2408.04682",
},
{
"name": "When2Call",
"category": "ACTIONS",
"calc_weight": 1.0,
"description": "Tool call timing \u2014 when to trigger, when to follow-up, etc.",
"summary": (
"Evaluates tool-call timing decisions: knowing when to invoke a tool, when to ask "
"for clarification first, and when to provide a direct answer without tools. Tests "
"the critical judgment of whether a function call is appropriate given the current "
"conversational context and available information."
),
"paper": None,
},
{
"name": "GRM-Bench \u2014 Prompt Robustness",
"category": "ACTIONS",
"calc_weight": 1.0,
"description": "Same prompt expressed differently still invoking intended tools",
"summary": (
"Nvidia-authored benchmark testing whether semantically equivalent prompts "
"expressed in different phrasings, formality levels, and syntactic structures "
"still correctly invoke the intended tools and actions. Measures robustness of "
"tool-call intent recognition against natural language variation."
),
"paper": None,
},
# Supplementary action / tool-calling benchmarks (calc_weight 0.5)
{
"name": "BFCLv4",
"category": "ACTIONS",
"calc_weight": 0.5,
"description": "Adds memory into the loop, and tests format sensitivity",
"summary": (
"Berkeley Function-Calling Leaderboard v4 extends v3 with memory-augmented tool "
"calling scenarios and tests sensitivity to format variations in function schemas. "
"Evaluates how well models handle evolving context windows and maintain tool-call "
"accuracy when schema formats shift."
),
# NOTE(review): the URL slug reads "bfcl_v3_multi_turn" although this entry is
# BFCLv4 — confirm the link targets the intended post.
"paper": "https://gorilla.cs.berkeley.edu/blogs/12_bfcl_v3_multi_turn.html",
},
{
"name": "T-Eval",
"category": "ACTIONS",
"calc_weight": 0.5,
"description": "Step-by-step tool use and orchestration, logical tool decomposition",
"summary": (
"T-Eval evaluates step-by-step tool use and orchestration capabilities. Tests "
"logical decomposition of complex tasks into tool-calling sequences, including "
"plan generation, tool selection, argument filling, and response summarization. "
"Provides fine-grained analysis of where in the tool-use pipeline models fail."
),
"paper": "https://arxiv.org/abs/2312.14033",
},
# ── GENERAL (33%) ──────────────────────────────────────────────
# Core general benchmarks (calc_weight 1.0)
{
"name": "RULER",
"category": "GENERAL",
"calc_weight": 1.0,
"description": "Needle-in-haystack + polluted state stress test",
"summary": (
"RULER (Real-world Understanding of Long-context and Evaluation through Retrieval) "
"extends needle-in-a-haystack testing with multiple retrieval types, multi-hop "
"composition, and aggregation tasks at varying context lengths. Includes polluted "
"state and distractor injection to stress-test long-context faithfulness."
),
"paper": "https://arxiv.org/abs/2404.06654",
},
{
"name": "GaRAGe",
"category": "GENERAL",
"calc_weight": 1.0,
"description": "Deflect/refuse action when state is insufficient/corrupted",
"summary": (
"GaRAGe (Grounded and Attributed RAG Evaluation) tests a model's ability to "
"deflect or refuse action when the provided retrieval context is insufficient, "
"corrupted, or contradictory. Evaluates robustness against adversarial or low-"
"quality retrieved passages and the model's capacity to say 'I don't know' rather "
"than hallucinate an answer."
),
"paper": None,
},
# Supplementary general benchmarks (calc_weight 0.5)
{
"name": "IFBench",
"category": "GENERAL",
"calc_weight": 0.5,
"description": "Generic instruction following, not as prone to overfit vs IFEval",
"summary": (
"IFBench from AllenAI evaluates precise instruction following with 294 single-turn "
"questions testing counting, formatting, and sentence manipulation. Uses a loose "
"evaluation mode that accounts for extraneous text or formatting. Designed to be "
"less prone to overfitting compared to IFEval."
),
"paper": "https://arxiv.org/abs/2507.02833",
},
{
"name": "AA LCR",
"category": "GENERAL",
"calc_weight": 0.5,
"description": "Complex reasoning across long contexts",
"summary": (
"Artificial Analysis Long Context Reasoning benchmark with 100 hard text-based "
"questions spanning 7 document categories (Company Reports, Industry Reports, "
"Government Consultations, Academia, Legal, Marketing, Surveys). Requires ~100K "
"tokens of input per question, demanding reasoning across multiple long documents."
),
"paper": "https://artificialanalysis.ai/methodology/intelligence-benchmarking",
},
{
"name": "StructEval-T",
"category": "GENERAL",
"calc_weight": 0.5,
"description": "Format-following",
"summary": (
"StructEval-T evaluates structured output and format-following capabilities. "
"Tests whether models can adhere to specified output templates, formatting "
"constraints (JSON, XML, Markdown, tables), and structural requirements while "
"maintaining content accuracy."
),
"paper": None,
},
{
"name": "InverseIFEval",
"category": "GENERAL",
"calc_weight": 0.5,
"description": "Unconventional instruction following",
"summary": (
"Tests unconventional and counter-intuitive instruction following where models "
"must comply with unusual or inverted constraints. Evaluates whether models can "
"faithfully execute instructions that go against typical patterns, such as "
"intentionally producing specific error formats or following negative constraints."
),
"paper": None,
},
{
"name": "RAGTruth",
"category": "GENERAL",
"calc_weight": 0.5,
"description": "Hallucinations relative to retrieved context",
"summary": (
"RAGTruth benchmarks hallucination detection and prevention in RAG pipelines. "
"Evaluates whether models faithfully ground responses in provided retrieved "
"documents rather than generating unsupported claims. Covers summary-level and "
"sentence-level faithfulness across diverse document types."
),
"paper": "https://arxiv.org/abs/2401.00396",
},
{
"name": "SpatialText",
"category": "GENERAL",
"calc_weight": 0.5,
"description": "Text-based spatial cognition",
"summary": (
"SpatialText tests text-based spatial cognition — understanding spatial "
"relationships, positions, orientations, and arrangements described purely through "
"natural language. Critical for gaming scenarios involving navigation, object "
"placement, and environmental descriptions."
),
"paper": None,
},
{
"name": "SpartQA",
"category": "GENERAL",
"calc_weight": 0.5,
"description": "Spatial reasoning with textual spatial descriptions and Q&A",
"summary": (
"SpartQA is a question-answering benchmark for spatial reasoning from textual "
"descriptions. Models must understand object positions, relative locations, "
"containment, and spatial logic described in natural language passages, then answer "
"questions requiring spatial inference. Includes FindRelation, FindBlock, and "
"YesNo question types."
),
"paper": "https://arxiv.org/abs/2104.05832",
},
{
"name": "COPA",
"category": "GENERAL",
"calc_weight": 0.5,
"description": "Premise + choose plausible cause/effect",
"summary": (
"Choice of Plausible Alternatives (COPA) is a classic commonsense causal reasoning "
"benchmark. Given a premise, the model must select the more plausible cause or "
"effect from two options. Tests open-domain commonsense causal reasoning critical "
"for narrative coherence in game dialogue."
),
"paper": "https://people.ict.usc.edu/~gordon/copa.html",
},
{
"name": "PIQA",
"category": "GENERAL",
"calc_weight": 0.5,
"description": "Physical interaction common sense",
"summary": (
"Physical Interaction Question Answering tests physical commonsense knowledge "
"about everyday objects, their properties, affordances, and interactions. Each "
"question presents a goal and two solutions; the model must select the physically "
"plausible one. Relevant for game NPCs reasoning about physical world interactions."
),
"paper": "https://arxiv.org/abs/1911.11641",
},
]
# Category identifiers, matching the values used in each benchmark's "category" field.
CATEGORIES: list[str] = ["ROLEPLAY", "ACTIONS", "GENERAL"]

# Every category gets an equal share of the overall score. The share is derived
# from the list length (instead of a hard-coded 1 / 3) so the weights still sum
# to 1 if a category is ever added or removed.
CATEGORY_WEIGHT: float = 1 / len(CATEGORIES)  # currently 1/3, i.e. 33.3% each

# Human-readable label for each category identifier.
CATEGORY_DISPLAY: dict[str, str] = {
    "ROLEPLAY": "Roleplay",
    "ACTIONS": "Actions",
    "GENERAL": "General",
}
def get_benchmarks_by_category(category: str) -> list[dict]:
    """Return the registry entries whose "category" equals *category*.

    The comparison is an exact string match. Registry order is preserved, and
    a category with no matching entries yields an empty list.
    """
    matches = []
    for entry in BENCHMARKS:
        if entry["category"] == category:
            matches.append(entry)
    return matches
def get_all_benchmark_names() -> list[str]:
    """Return the "name" of every registered benchmark, in registry order."""
    names = []
    for entry in BENCHMARKS:
        names.append(entry["name"])
    return names