""" GRM Evaluation Suite — Benchmark Registry Each benchmark is defined with: - name: Display name - category: ROLEPLAY | ACTIONS | GENERAL - description: Short description of what the benchmark tests - calc_weight: 1.0 (core) or 0.5 (supplementary) - summary: Multi-sentence methodology summary for display - paper: URL to the paper or resource (if applicable) """ BENCHMARKS = [ # ── ROLEPLAY (33%) ────────────────────────────────────────────── { "name": "MultiChallenge", "category": "ROLEPLAY", "calc_weight": 1.0, "description": "Multi-turn instruction following & coherence across turns", "summary": ( "MultiChallenge evaluates multi-turn instruction following where models must " "maintain coherence across multiple challenging conversational turns. Each test " "scenario involves complex, multi-constraint instructions that require the model " "to track context, resolve references, and keep all prior commitments intact while " "handling new user requests." ), "paper": None, }, { "name": "RoleBench", "category": "ROLEPLAY", "calc_weight": 1.0, "description": "Character-level roleplaying with explicit role framing", "summary": ( "RoleBench is the first systematic and fine-grained character-level benchmark " "for role-playing, comprising 168,093 samples covering 100 distinct roles. " "Created via the RoleLLM framework using Context-Instruct for role-specific " "knowledge extraction and RoleGPT for speaking style imitation. Evaluates a " "model's ability to maintain character persona, domain knowledge, and consistent " "speaking style throughout interactions." ), "paper": "https://arxiv.org/abs/2310.00746", }, { "name": "GRM-Bench \u2014 Coherence", "category": "ROLEPLAY", "calc_weight": 1.0, "description": "Logically sound and coherent across turns, without contradictions", "summary": ( "Nvidia-authored benchmark testing resistance to incoherence in gaming dialogue. " "Scenarios are crafted to invoke common coherence failures, then a model's " "resilience is measured. Detection covers eight categories: factual/logical errors, " "cause-effect failures, self-contradiction, personality/background violations, " "role confusion, irrelevance, knowledge boundary violations, and false premise " "acceptance." ), "paper": None, }, { "name": "GRM-Bench \u2014 Response Diversity", "category": "ROLEPLAY", "calc_weight": 1.0, "description": "Avoids using repetitive language and speech structure", "summary": ( "Nvidia-authored benchmark measuring whether models avoid repetitive language " "patterns, vocabulary, and sentence structures across varied dialogue exchanges. " "Evaluates lexical diversity, syntactic variation, and stylistic range when a " "character is placed in different conversation contexts." ), "paper": None, }, { "name": "GRM-Bench \u2014 Context Adaption", "category": "ROLEPLAY", "calc_weight": 1.0, "description": "Using latest knowledge/variable updates even if changed during convo", "summary": ( "Nvidia-authored benchmark testing whether models correctly incorporate the latest " "game-state and knowledge updates, even when facts change mid-conversation. " "Scenarios involve dynamic variable mutations (e.g. inventory changes, NPC status " "updates) and verify the model references the current state rather than stale data." ), "paper": None, }, { "name": "DialogueNLI", "category": "ROLEPLAY", "calc_weight": 0.5, "description": "Checks contradiction/consistency crumbling", "summary": ( "Dialogue Natural Language Inference dataset for evaluating consistency in " "dialogue agents. Uses NLI-style classification (entailment / contradiction / " "neutral) to detect when a dialogue agent contradicts its established persona " "or previous statements. Derived from the Persona-Chat dataset with human-" "annotated sentence pairs." ), "paper": "https://arxiv.org/abs/1811.00671", }, { "name": "RoleMRC", "category": "ROLEPLAY", "calc_weight": 0.5, "description": "Follow complex nested instructions while remaining in character", "summary": ( "Role-based Machine Reading Comprehension benchmark that tests the ability to " "follow complex, nested instructions while remaining fully in character. Combines " "reading comprehension challenges with role-playing constraints, requiring models " "to extract and reason about information without breaking persona." ), "paper": None, }, { "name": "EQBench v3", "category": "ROLEPLAY", "calc_weight": 0.5, "description": "Detect nuances in tone/intent and modulate response accordingly", "summary": ( "Emotional Quotient Benchmark v3 assesses a model's emotional intelligence — " "specifically the ability to detect nuances in tone, intent, and emotional subtext, " "and to modulate responses accordingly. Tests include recognizing sarcasm, empathy " "calibration, emotional escalation/de-escalation, and context-appropriate tonal " "shifts." ), "paper": "https://eqbench.com/", }, # ── ACTIONS (33%) ─────────────────────────────────────────────── { "name": "BFCLv3", "category": "ACTIONS", "calc_weight": 1.0, "description": "Serial/parallel tool calling, multi-step settings", "summary": ( "Berkeley Function-Calling Leaderboard v3 evaluates serial and parallel tool " "calling in multi-step settings across multiple programming languages and complex " "function schemas. Tests include simple, multiple, parallel, and nested function " "calls, as well as function relevance detection (knowing when no tool applies)." ), "paper": "https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html", }, { "name": "Tau2-Bench", "category": "ACTIONS", "calc_weight": 1.0, "description": "Multi-turn interactions w/ real-world commercial operations", "summary": ( "\u03c4\u00b2-Bench from Sierra Research is a multi-turn agentic benchmark using " "dual-control agent-user simulation for testing tool use in real-world commercial " "operations. The telecom domain contains 114 programmatically generated tasks " "with varying intents (service, mobile data, MMS). The outcome world-state " "determines success — e.g. whether Data is functioning after agent completion." ), "paper": "https://arxiv.org/abs/2506.07982", }, { "name": "ToolSandbox", "category": "ACTIONS", "calc_weight": 1.0, "description": "Stateful dependencies + conversational tool calling", "summary": ( "Apple's stateful, conversational, interactive evaluation benchmark for LLM tool " "use. Includes stateful tool execution with implicit state dependencies between " "tools, a built-in user simulator supporting on-policy conversational evaluation, " "and dynamic evaluation of intermediate and final milestones. Tests canonicalization, " "insufficient information handling, and complex state management." ), "paper": "https://arxiv.org/abs/2408.04682", }, { "name": "When2Call", "category": "ACTIONS", "calc_weight": 1.0, "description": "Tool call timing \u2014 when to trigger, when to follow-up, etc.", "summary": ( "Evaluates tool-call timing decisions: knowing when to invoke a tool, when to ask " "for clarification first, and when to provide a direct answer without tools. Tests " "the critical judgment of whether a function call is appropriate given the current " "conversational context and available information." ), "paper": None, }, { "name": "GRM-Bench \u2014 Prompt Robustness", "category": "ACTIONS", "calc_weight": 1.0, "description": "Same prompt expressed differently still invoking intended tools", "summary": ( "Nvidia-authored benchmark testing whether semantically equivalent prompts " "expressed in different phrasings, formality levels, and syntactic structures " "still correctly invoke the intended tools and actions. Measures robustness of " "tool-call intent recognition against natural language variation." ), "paper": None, }, { "name": "BFCLv4", "category": "ACTIONS", "calc_weight": 0.5, "description": "Adds memory into the loop, and tests format sensitivity", "summary": ( "Berkeley Function-Calling Leaderboard v4 extends v3 with memory-augmented tool " "calling scenarios and tests sensitivity to format variations in function schemas. " "Evaluates how well models handle evolving context windows and maintain tool-call " "accuracy when schema formats shift." ), "paper": "https://gorilla.cs.berkeley.edu/blogs/12_bfcl_v3_multi_turn.html", }, { "name": "T-Eval", "category": "ACTIONS", "calc_weight": 0.5, "description": "Step-by-step tool use and orchestration, logical tool decomposition", "summary": ( "T-Eval evaluates step-by-step tool use and orchestration capabilities. Tests " "logical decomposition of complex tasks into tool-calling sequences, including " "plan generation, tool selection, argument filling, and response summarization. " "Provides fine-grained analysis of where in the tool-use pipeline models fail." ), "paper": "https://arxiv.org/abs/2312.14033", }, # ── GENERAL (33%) ────────────────────────────────────────────── { "name": "RULER", "category": "GENERAL", "calc_weight": 1.0, "description": "Needle-in-haystack + polluted state stress test", "summary": ( "RULER (Real-world Understanding of Long-context and Evaluation through Retrieval) " "extends needle-in-a-haystack testing with multiple retrieval types, multi-hop " "composition, and aggregation tasks at varying context lengths. Includes polluted " "state and distractor injection to stress-test long-context faithfulness." ), "paper": "https://arxiv.org/abs/2404.06654", }, { "name": "GaRAGe", "category": "GENERAL", "calc_weight": 1.0, "description": "Deflect/refuse action when state is insufficient/corrupted", "summary": ( "GaRAGe (Grounded and Attributed RAG Evaluation) tests a model's ability to " "deflect or refuse action when the provided retrieval context is insufficient, " "corrupted, or contradictory. Evaluates robustness against adversarial or low-" "quality retrieved passages and the model's capacity to say 'I don't know' rather " "than hallucinate an answer." ), "paper": None, }, { "name": "IFBench", "category": "GENERAL", "calc_weight": 0.5, "description": "Generic instruction following, not as prone to overfit vs IFEval", "summary": ( "IFBench from AllenAI evaluates precise instruction following with 294 single-turn " "questions testing counting, formatting, and sentence manipulation. Uses a loose " "evaluation mode that accounts for extraneous text or formatting. Designed to be " "less prone to overfitting compared to IFEval." ), "paper": "https://arxiv.org/abs/2507.02833", }, { "name": "AA LCR", "category": "GENERAL", "calc_weight": 0.5, "description": "Complex reasoning across long contexts", "summary": ( "Artificial Analysis Long Context Reasoning benchmark with 100 hard text-based " "questions spanning 7 document categories (Company Reports, Industry Reports, " "Government Consultations, Academia, Legal, Marketing, Surveys). Requires ~100K " "tokens of input per question, demanding reasoning across multiple long documents." ), "paper": "https://artificialanalysis.ai/methodology/intelligence-benchmarking", }, { "name": "StructEval-T", "category": "GENERAL", "calc_weight": 0.5, "description": "Format-following", "summary": ( "StructEval-T evaluates structured output and format-following capabilities. " "Tests whether models can adhere to specified output templates, formatting " "constraints (JSON, XML, Markdown, tables), and structural requirements while " "maintaining content accuracy." ), "paper": None, }, { "name": "InverseIFEval", "category": "GENERAL", "calc_weight": 0.5, "description": "Unconventional instruction following", "summary": ( "Tests unconventional and counter-intuitive instruction following where models " "must comply with unusual or inverted constraints. Evaluates whether models can " "faithfully execute instructions that go against typical patterns, such as " "intentionally producing specific error formats or following negative constraints." ), "paper": None, }, { "name": "RAGTruth", "category": "GENERAL", "calc_weight": 0.5, "description": "Hallucinations relative to retrieved context", "summary": ( "RAGTruth benchmarks hallucination detection and prevention in RAG pipelines. " "Evaluates whether models faithfully ground responses in provided retrieved " "documents rather than generating unsupported claims. Covers summary-level and " "sentence-level faithfulness across diverse document types." ), "paper": "https://arxiv.org/abs/2401.00396", }, { "name": "SpatialText", "category": "GENERAL", "calc_weight": 0.5, "description": "Text-based spatial cognition", "summary": ( "SpatialText tests text-based spatial cognition — understanding spatial " "relationships, positions, orientations, and arrangements described purely through " "natural language. Critical for gaming scenarios involving navigation, object " "placement, and environmental descriptions." ), "paper": None, }, { "name": "SpartQA", "category": "GENERAL", "calc_weight": 0.5, "description": "Spatial reasoning with textual spatial descriptions and Q&A", "summary": ( "SpartQA is a question-answering benchmark for spatial reasoning from textual " "descriptions. Models must understand object positions, relative locations, " "containment, and spatial logic described in natural language passages, then answer " "questions requiring spatial inference. Includes FindRelation, FindBlock, and " "YesNo question types." ), "paper": "https://arxiv.org/abs/2104.05832", }, { "name": "COPA", "category": "GENERAL", "calc_weight": 0.5, "description": "Premise + choose plausible cause/effect", "summary": ( "Choice of Plausible Alternatives (COPA) is a classic commonsense causal reasoning " "benchmark. Given a premise, the model must select the more plausible cause or " "effect from two options. Tests open-domain commonsense causal reasoning critical " "for narrative coherence in game dialogue." ), "paper": "https://people.ict.usc.edu/~gordon/copa.html", }, { "name": "PIQA", "category": "GENERAL", "calc_weight": 0.5, "description": "Physical interaction common sense", "summary": ( "Physical Interaction Question Answering tests physical commonsense knowledge " "about everyday objects, their properties, affordances, and interactions. Each " "question presents a goal and two solutions; the model must select the physically " "plausible one. Relevant for game NPCs reasoning about physical world interactions." ), "paper": "https://arxiv.org/abs/1911.11641", }, ] CATEGORIES = ["ROLEPLAY", "ACTIONS", "GENERAL"] CATEGORY_WEIGHT = 1 / 3 # Each category contributes 33.3% CATEGORY_DISPLAY = { "ROLEPLAY": "Roleplay", "ACTIONS": "Actions", "GENERAL": "General", } def get_benchmarks_by_category(category: str) -> list[dict]: return [b for b in BENCHMARKS if b["category"] == category] def get_all_benchmark_names() -> list[str]: return [b["name"] for b in BENCHMARKS]