Spaces:

nvidia
/

GRM

Running on CPU Upgrade

App Files Files Community

GRM / benchmarks.py

mbagdasarova-nvidia

Add GRM evaluation suite with scoring logic and benchmark registry

2a44234 2 months ago

raw

history blame

18.1 kB

	"""
	GRM Evaluation Suite — Benchmark Registry

	Each benchmark is defined with:
	- name: Display name
	- category: ROLEPLAY | ACTIONS | GENERAL
	- description: Short description of what the benchmark tests
	- calc_weight: 1.0 (core) or 0.5 (supplementary)
	- summary: Multi-sentence methodology summary for display
	- paper: URL to the paper or resource (if applicable)
	"""

	# Registry of benchmark definitions, listed category by category in the
	# order ROLEPLAY, ACTIONS, GENERAL. Every entry is a plain dict with the keys
	# "name", "category", "calc_weight", "description", "summary" and "paper".
	# "calc_weight" is either 1.0 or 0.5 in every entry below, and "paper" is
	# None when the entry carries no reference URL.
	BENCHMARKS = [
	# ── ROLEPLAY (33%) ──────────────────────────────────────────────
	# 8 entries: 5 with calc_weight 1.0, 3 with calc_weight 0.5.
	{
	"name": "MultiChallenge",
	"category": "ROLEPLAY",
	"calc_weight": 1.0,
	"description": "Multi-turn instruction following & coherence across turns",
	"summary": (
	"MultiChallenge evaluates multi-turn instruction following where models must "
	"maintain coherence across multiple challenging conversational turns. Each test "
	"scenario involves complex, multi-constraint instructions that require the model "
	"to track context, resolve references, and keep all prior commitments intact while "
	"handling new user requests."
	),
	"paper": None,
	},
	{
	"name": "RoleBench",
	"category": "ROLEPLAY",
	"calc_weight": 1.0,
	"description": "Character-level roleplaying with explicit role framing",
	"summary": (
	"RoleBench is the first systematic and fine-grained character-level benchmark "
	"for role-playing, comprising 168,093 samples covering 100 distinct roles. "
	"Created via the RoleLLM framework using Context-Instruct for role-specific "
	"knowledge extraction and RoleGPT for speaking style imitation. Evaluates a "
	"model's ability to maintain character persona, domain knowledge, and consistent "
	"speaking style throughout interactions."
	),
	"paper": "https://arxiv.org/abs/2310.00746",
	},
	{
	# "\u2014" is an em dash; the GRM-Bench names below all use it as a separator.
	"name": "GRM-Bench \u2014 Coherence",
	"category": "ROLEPLAY",
	"calc_weight": 1.0,
	"description": "Logically sound and coherent across turns, without contradictions",
	"summary": (
	"Nvidia-authored benchmark testing resistance to incoherence in gaming dialogue. "
	"Scenarios are crafted to invoke common coherence failures, then a model's "
	"resilience is measured. Detection covers eight categories: factual/logical errors, "
	"cause-effect failures, self-contradiction, personality/background violations, "
	"role confusion, irrelevance, knowledge boundary violations, and false premise "
	"acceptance."
	),
	"paper": None,
	},
	{
	"name": "GRM-Bench \u2014 Response Diversity",
	"category": "ROLEPLAY",
	"calc_weight": 1.0,
	"description": "Avoids using repetitive language and speech structure",
	"summary": (
	"Nvidia-authored benchmark measuring whether models avoid repetitive language "
	"patterns, vocabulary, and sentence structures across varied dialogue exchanges. "
	"Evaluates lexical diversity, syntactic variation, and stylistic range when a "
	"character is placed in different conversation contexts."
	),
	"paper": None,
	},
	{
	# NOTE(review): "Adaption" may be a typo for "Adaptation". The name is a
	# runtime string, so confirm nothing keys on it before renaming.
	"name": "GRM-Bench \u2014 Context Adaption",
	"category": "ROLEPLAY",
	"calc_weight": 1.0,
	"description": "Using latest knowledge/variable updates even if changed during convo",
	"summary": (
	"Nvidia-authored benchmark testing whether models correctly incorporate the latest "
	"game-state and knowledge updates, even when facts change mid-conversation. "
	"Scenarios involve dynamic variable mutations (e.g. inventory changes, NPC status "
	"updates) and verify the model references the current state rather than stale data."
	),
	"paper": None,
	},
	{
	"name": "DialogueNLI",
	"category": "ROLEPLAY",
	"calc_weight": 0.5,
	"description": "Checks contradiction/consistency crumbling",
	"summary": (
	"Dialogue Natural Language Inference dataset for evaluating consistency in "
	"dialogue agents. Uses NLI-style classification (entailment / contradiction / "
	"neutral) to detect when a dialogue agent contradicts its established persona "
	"or previous statements. Derived from the Persona-Chat dataset with human-"
	"annotated sentence pairs."
	),
	"paper": "https://arxiv.org/abs/1811.00671",
	},
	{
	"name": "RoleMRC",
	"category": "ROLEPLAY",
	"calc_weight": 0.5,
	"description": "Follow complex nested instructions while remaining in character",
	"summary": (
	"Role-based Machine Reading Comprehension benchmark that tests the ability to "
	"follow complex, nested instructions while remaining fully in character. Combines "
	"reading comprehension challenges with role-playing constraints, requiring models "
	"to extract and reason about information without breaking persona."
	),
	"paper": None,
	},
	{
	"name": "EQBench v3",
	"category": "ROLEPLAY",
	"calc_weight": 0.5,
	"description": "Detect nuances in tone/intent and modulate response accordingly",
	"summary": (
	"Emotional Quotient Benchmark v3 assesses a model's emotional intelligence — "
	"specifically the ability to detect nuances in tone, intent, and emotional subtext, "
	"and to modulate responses accordingly. Tests include recognizing sarcasm, empathy "
	"calibration, emotional escalation/de-escalation, and context-appropriate tonal "
	"shifts."
	),
	"paper": "https://eqbench.com/",
	},

	# ── ACTIONS (33%) ───────────────────────────────────────────────
	# 7 entries: 5 with calc_weight 1.0, 2 with calc_weight 0.5.
	{
	"name": "BFCLv3",
	"category": "ACTIONS",
	"calc_weight": 1.0,
	"description": "Serial/parallel tool calling, multi-step settings",
	"summary": (
	"Berkeley Function-Calling Leaderboard v3 evaluates serial and parallel tool "
	"calling in multi-step settings across multiple programming languages and complex "
	"function schemas. Tests include simple, multiple, parallel, and nested function "
	"calls, as well as function relevance detection (knowing when no tool applies)."
	),
	"paper": "https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html",
	},
	{
	"name": "Tau2-Bench",
	"category": "ACTIONS",
	"calc_weight": 1.0,
	"description": "Multi-turn interactions w/ real-world commercial operations",
	"summary": (
	# "\u03c4\u00b2" renders as the Greek letter tau followed by a superscript two.
	"\u03c4\u00b2-Bench from Sierra Research is a multi-turn agentic benchmark using "
	"dual-control agent-user simulation for testing tool use in real-world commercial "
	"operations. The telecom domain contains 114 programmatically generated tasks "
	"with varying intents (service, mobile data, MMS). The outcome world-state "
	"determines success — e.g. whether Data is functioning after agent completion."
	),
	"paper": "https://arxiv.org/abs/2506.07982",
	},
	{
	"name": "ToolSandbox",
	"category": "ACTIONS",
	"calc_weight": 1.0,
	"description": "Stateful dependencies + conversational tool calling",
	"summary": (
	"Apple's stateful, conversational, interactive evaluation benchmark for LLM tool "
	"use. Includes stateful tool execution with implicit state dependencies between "
	"tools, a built-in user simulator supporting on-policy conversational evaluation, "
	"and dynamic evaluation of intermediate and final milestones. Tests canonicalization, "
	"insufficient information handling, and complex state management."
	),
	"paper": "https://arxiv.org/abs/2408.04682",
	},
	{
	"name": "When2Call",
	"category": "ACTIONS",
	"calc_weight": 1.0,
	"description": "Tool call timing \u2014 when to trigger, when to follow-up, etc.",
	"summary": (
	"Evaluates tool-call timing decisions: knowing when to invoke a tool, when to ask "
	"for clarification first, and when to provide a direct answer without tools. Tests "
	"the critical judgment of whether a function call is appropriate given the current "
	"conversational context and available information."
	),
	"paper": None,
	},
	{
	"name": "GRM-Bench \u2014 Prompt Robustness",
	"category": "ACTIONS",
	"calc_weight": 1.0,
	"description": "Same prompt expressed differently still invoking intended tools",
	"summary": (
	"Nvidia-authored benchmark testing whether semantically equivalent prompts "
	"expressed in different phrasings, formality levels, and syntactic structures "
	"still correctly invoke the intended tools and actions. Measures robustness of "
	"tool-call intent recognition against natural language variation."
	),
	"paper": None,
	},
	{
	"name": "BFCLv4",
	"category": "ACTIONS",
	"calc_weight": 0.5,
	"description": "Adds memory into the loop, and tests format sensitivity",
	"summary": (
	"Berkeley Function-Calling Leaderboard v4 extends v3 with memory-augmented tool "
	"calling scenarios and tests sensitivity to format variations in function schemas. "
	"Evaluates how well models handle evolving context windows and maintain tool-call "
	"accuracy when schema formats shift."
	),
	# NOTE(review): this URL's filename mentions "bfcl_v3" although the entry is
	# BFCLv4 — confirm the intended link.
	"paper": "https://gorilla.cs.berkeley.edu/blogs/12_bfcl_v3_multi_turn.html",
	},
	{
	"name": "T-Eval",
	"category": "ACTIONS",
	"calc_weight": 0.5,
	"description": "Step-by-step tool use and orchestration, logical tool decomposition",
	"summary": (
	"T-Eval evaluates step-by-step tool use and orchestration capabilities. Tests "
	"logical decomposition of complex tasks into tool-calling sequences, including "
	"plan generation, tool selection, argument filling, and response summarization. "
	"Provides fine-grained analysis of where in the tool-use pipeline models fail."
	),
	"paper": "https://arxiv.org/abs/2312.14033",
	},

	# ── GENERAL (33%) ──────────────────────────────────────────────
	# 11 entries: 2 with calc_weight 1.0, 9 with calc_weight 0.5.
	{
	"name": "RULER",
	"category": "GENERAL",
	"calc_weight": 1.0,
	"description": "Needle-in-haystack + polluted state stress test",
	"summary": (
	# NOTE(review): confirm this acronym expansion against the linked paper.
	"RULER (Real-world Understanding of Long-context and Evaluation through Retrieval) "
	"extends needle-in-a-haystack testing with multiple retrieval types, multi-hop "
	"composition, and aggregation tasks at varying context lengths. Includes polluted "
	"state and distractor injection to stress-test long-context faithfulness."
	),
	"paper": "https://arxiv.org/abs/2404.06654",
	},
	{
	"name": "GaRAGe",
	"category": "GENERAL",
	"calc_weight": 1.0,
	"description": "Deflect/refuse action when state is insufficient/corrupted",
	"summary": (
	# NOTE(review): confirm this acronym expansion against the benchmark's paper.
	"GaRAGe (Grounded and Attributed RAG Evaluation) tests a model's ability to "
	"deflect or refuse action when the provided retrieval context is insufficient, "
	"corrupted, or contradictory. Evaluates robustness against adversarial or low-"
	"quality retrieved passages and the model's capacity to say 'I don't know' rather "
	"than hallucinate an answer."
	),
	"paper": None,
	},
	{
	"name": "IFBench",
	"category": "GENERAL",
	"calc_weight": 0.5,
	"description": "Generic instruction following, not as prone to overfit vs IFEval",
	"summary": (
	"IFBench from AllenAI evaluates precise instruction following with 294 single-turn "
	"questions testing counting, formatting, and sentence manipulation. Uses a loose "
	"evaluation mode that accounts for extraneous text or formatting. Designed to be "
	"less prone to overfitting compared to IFEval."
	),
	"paper": "https://arxiv.org/abs/2507.02833",
	},
	{
	"name": "AA LCR",
	"category": "GENERAL",
	"calc_weight": 0.5,
	"description": "Complex reasoning across long contexts",
	"summary": (
	"Artificial Analysis Long Context Reasoning benchmark with 100 hard text-based "
	"questions spanning 7 document categories (Company Reports, Industry Reports, "
	"Government Consultations, Academia, Legal, Marketing, Surveys). Requires ~100K "
	"tokens of input per question, demanding reasoning across multiple long documents."
	),
	"paper": "https://artificialanalysis.ai/methodology/intelligence-benchmarking",
	},
	{
	"name": "StructEval-T",
	"category": "GENERAL",
	"calc_weight": 0.5,
	"description": "Format-following",
	"summary": (
	"StructEval-T evaluates structured output and format-following capabilities. "
	"Tests whether models can adhere to specified output templates, formatting "
	"constraints (JSON, XML, Markdown, tables), and structural requirements while "
	"maintaining content accuracy."
	),
	"paper": None,
	},
	{
	"name": "InverseIFEval",
	"category": "GENERAL",
	"calc_weight": 0.5,
	"description": "Unconventional instruction following",
	"summary": (
	"Tests unconventional and counter-intuitive instruction following where models "
	"must comply with unusual or inverted constraints. Evaluates whether models can "
	"faithfully execute instructions that go against typical patterns, such as "
	"intentionally producing specific error formats or following negative constraints."
	),
	"paper": None,
	},
	{
	"name": "RAGTruth",
	"category": "GENERAL",
	"calc_weight": 0.5,
	"description": "Hallucinations relative to retrieved context",
	"summary": (
	"RAGTruth benchmarks hallucination detection and prevention in RAG pipelines. "
	"Evaluates whether models faithfully ground responses in provided retrieved "
	"documents rather than generating unsupported claims. Covers summary-level and "
	"sentence-level faithfulness across diverse document types."
	),
	"paper": "https://arxiv.org/abs/2401.00396",
	},
	{
	"name": "SpatialText",
	"category": "GENERAL",
	"calc_weight": 0.5,
	"description": "Text-based spatial cognition",
	"summary": (
	"SpatialText tests text-based spatial cognition — understanding spatial "
	"relationships, positions, orientations, and arrangements described purely through "
	"natural language. Critical for gaming scenarios involving navigation, object "
	"placement, and environmental descriptions."
	),
	"paper": None,
	},
	{
	"name": "SpartQA",
	"category": "GENERAL",
	"calc_weight": 0.5,
	"description": "Spatial reasoning with textual spatial descriptions and Q&A",
	"summary": (
	"SpartQA is a question-answering benchmark for spatial reasoning from textual "
	"descriptions. Models must understand object positions, relative locations, "
	"containment, and spatial logic described in natural language passages, then answer "
	"questions requiring spatial inference. Includes FindRelation, FindBlock, and "
	"YesNo question types."
	),
	"paper": "https://arxiv.org/abs/2104.05832",
	},
	{
	"name": "COPA",
	"category": "GENERAL",
	"calc_weight": 0.5,
	"description": "Premise + choose plausible cause/effect",
	"summary": (
	"Choice of Plausible Alternatives (COPA) is a classic commonsense causal reasoning "
	"benchmark. Given a premise, the model must select the more plausible cause or "
	"effect from two options. Tests open-domain commonsense causal reasoning critical "
	"for narrative coherence in game dialogue."
	),
	"paper": "https://people.ict.usc.edu/~gordon/copa.html",
	},
	{
	"name": "PIQA",
	"category": "GENERAL",
	"calc_weight": 0.5,
	"description": "Physical interaction common sense",
	"summary": (
	"Physical Interaction Question Answering tests physical commonsense knowledge "
	"about everyday objects, their properties, affordances, and interactions. Each "
	"question presents a goal and two solutions; the model must select the physically "
	"plausible one. Relevant for game NPCs reasoning about physical world interactions."
	),
	"paper": "https://arxiv.org/abs/1911.11641",
	},
	]

	# Category identifiers, in the order the categories are listed.
	CATEGORIES = ["ROLEPLAY", "ACTIONS", "GENERAL"]

	# Each category contributes an equal share: one third (33.3%).
	CATEGORY_WEIGHT = 1 / len(CATEGORIES)

	# Human-readable label for each category identifier, paired positionally
	# with CATEGORIES so the mapping keeps the same key order.
	CATEGORY_DISPLAY = dict(
	    zip(CATEGORIES, ("Roleplay", "Actions", "General"))
	)


	def get_benchmarks_by_category(category: str) -> list[dict]:
	    """Return the registry entries whose ``"category"`` equals *category*.

	    Entries keep their registry order and are the same dict objects stored
	    in ``BENCHMARKS`` (no copies are made). A category that matches no entry
	    yields an empty list.
	    """
	    matching = []
	    for entry in BENCHMARKS:
	        if entry["category"] == category:
	            matching.append(entry)
	    return matching


	def get_all_benchmark_names() -> list[str]:
	    """Return the ``"name"`` of every entry in ``BENCHMARKS``, in registry order."""
	    names = []
	    for entry in BENCHMARKS:
	        names.append(entry["name"])
	    return names