"""
GRM Evaluation Suite — Benchmark Registry

Each benchmark is defined with:
  - name: Display name
  - category: ROLEPLAY | ACTIONS | GENERAL
  - description: Short description of what the benchmark tests
  - calc_weight: 1.0 (core) or 0.5 (supplementary)
  - summary: Multi-sentence methodology summary for display
  - paper: URL to the paper or resource, or None when no reference is recorded
"""

# Registry of every benchmark in the suite. Each entry is a dict with the keys
# "name", "category", "calc_weight", "description", "summary" and "paper"
# ("paper" is None when no reference URL is recorded). Entries are grouped by
# category and, within a category, core benchmarks (calc_weight 1.0) are
# listed before supplementary ones (calc_weight 0.5).
BENCHMARKS = [
    # ── ROLEPLAY (33%) ──────────────────────────────────────────────
    {
        "name": "MultiChallenge",
        "category": "ROLEPLAY",
        "calc_weight": 1.0,
        "description": "Multi-turn instruction following & coherence across turns",
        "summary": (
            "MultiChallenge evaluates multi-turn instruction following where models must "
            "maintain coherence across multiple challenging conversational turns. Each test "
            "scenario involves complex, multi-constraint instructions that require the model "
            "to track context, resolve references, and keep all prior commitments intact while "
            "handling new user requests."
        ),
        "paper": None,
    },
    {
        "name": "RoleBench",
        "category": "ROLEPLAY",
        "calc_weight": 1.0,
        "description": "Character-level roleplaying with explicit role framing",
        "summary": (
            "RoleBench is the first systematic and fine-grained character-level benchmark "
            "for role-playing, comprising 168,093 samples covering 100 distinct roles. "
            "Created via the RoleLLM framework using Context-Instruct for role-specific "
            "knowledge extraction and RoleGPT for speaking style imitation. Evaluates a "
            "model's ability to maintain character persona, domain knowledge, and consistent "
            "speaking style throughout interactions."
        ),
        "paper": "https://arxiv.org/abs/2310.00746",
    },
    {
        "name": "GRM-Bench \u2014 Coherence",
        "category": "ROLEPLAY",
        "calc_weight": 1.0,
        "description": "Logically sound and coherent across turns, without contradictions",
        "summary": (
            "Nvidia-authored benchmark testing resistance to incoherence in gaming dialogue. "
            "Scenarios are crafted to invoke common coherence failures, then a model's "
            "resilience is measured. Detection covers eight categories: factual/logical errors, "
            "cause-effect failures, self-contradiction, personality/background violations, "
            "role confusion, irrelevance, knowledge boundary violations, and false premise "
            "acceptance."
        ),
        "paper": None,
    },
    {
        "name": "GRM-Bench \u2014 Response Diversity",
        "category": "ROLEPLAY",
        "calc_weight": 1.0,
        "description": "Avoids using repetitive language and speech structure",
        "summary": (
            "Nvidia-authored benchmark measuring whether models avoid repetitive language "
            "patterns, vocabulary, and sentence structures across varied dialogue exchanges. "
            "Evaluates lexical diversity, syntactic variation, and stylistic range when a "
            "character is placed in different conversation contexts."
        ),
        "paper": None,
    },
    {
        "name": "GRM-Bench \u2014 Context Adaption",
        "category": "ROLEPLAY",
        "calc_weight": 1.0,
        "description": "Using latest knowledge/variable updates even if changed during convo",
        "summary": (
            "Nvidia-authored benchmark testing whether models correctly incorporate the latest "
            "game-state and knowledge updates, even when facts change mid-conversation. "
            "Scenarios involve dynamic variable mutations (e.g. inventory changes, NPC status "
            "updates) and verify the model references the current state rather than stale data."
        ),
        "paper": None,
    },
    {
        "name": "DialogueNLI",
        "category": "ROLEPLAY",
        "calc_weight": 0.5,
        "description": "Checks contradiction/consistency crumbling",
        "summary": (
            "Dialogue Natural Language Inference dataset for evaluating consistency in "
            "dialogue agents. Uses NLI-style classification (entailment / contradiction / "
            "neutral) to detect when a dialogue agent contradicts its established persona "
            "or previous statements. Derived from the Persona-Chat dataset with human-"
            "annotated sentence pairs."
        ),
        "paper": "https://arxiv.org/abs/1811.00671",
    },
    {
        "name": "RoleMRC",
        "category": "ROLEPLAY",
        "calc_weight": 0.5,
        "description": "Follow complex nested instructions while remaining in character",
        "summary": (
            "Role-based Machine Reading Comprehension benchmark that tests the ability to "
            "follow complex, nested instructions while remaining fully in character. Combines "
            "reading comprehension challenges with role-playing constraints, requiring models "
            "to extract and reason about information without breaking persona."
        ),
        "paper": None,
    },
    {
        "name": "EQBench v3",
        "category": "ROLEPLAY",
        "calc_weight": 0.5,
        "description": "Detect nuances in tone/intent and modulate response accordingly",
        "summary": (
            "Emotional Quotient Benchmark v3 assesses a model's emotional intelligence — "
            "specifically the ability to detect nuances in tone, intent, and emotional subtext, "
            "and to modulate responses accordingly. Tests include recognizing sarcasm, empathy "
            "calibration, emotional escalation/de-escalation, and context-appropriate tonal "
            "shifts."
        ),
        "paper": "https://eqbench.com/",
    },

    # ── ACTIONS (33%) ───────────────────────────────────────────────
    {
        "name": "BFCLv3",
        "category": "ACTIONS",
        "calc_weight": 1.0,
        "description": "Serial/parallel tool calling, multi-step settings",
        "summary": (
            "Berkeley Function-Calling Leaderboard v3 evaluates serial and parallel tool "
            "calling in multi-step settings across multiple programming languages and complex "
            "function schemas. Tests include simple, multiple, parallel, and nested function "
            "calls, as well as function relevance detection (knowing when no tool applies)."
        ),
        # Version-specific reference: the BFCL v3 multi-turn blog post.
        "paper": "https://gorilla.cs.berkeley.edu/blogs/13_bfcl_v3_multi_turn.html",
    },
    {
        "name": "Tau2-Bench",
        "category": "ACTIONS",
        "calc_weight": 1.0,
        "description": "Multi-turn interactions w/ real-world commercial operations",
        "summary": (
            "\u03c4\u00b2-Bench from Sierra Research is a multi-turn agentic benchmark using "
            "dual-control agent-user simulation for testing tool use in real-world commercial "
            "operations. The telecom domain contains 114 programmatically generated tasks "
            "with varying intents (service, mobile data, MMS). The outcome world-state "
            "determines success — e.g. whether Data is functioning after agent completion."
        ),
        "paper": "https://arxiv.org/abs/2506.07982",
    },
    {
        "name": "ToolSandbox",
        "category": "ACTIONS",
        "calc_weight": 1.0,
        "description": "Stateful dependencies + conversational tool calling",
        "summary": (
            "Apple's stateful, conversational, interactive evaluation benchmark for LLM tool "
            "use. Includes stateful tool execution with implicit state dependencies between "
            "tools, a built-in user simulator supporting on-policy conversational evaluation, "
            "and dynamic evaluation of intermediate and final milestones. Tests canonicalization, "
            "insufficient information handling, and complex state management."
        ),
        "paper": "https://arxiv.org/abs/2408.04682",
    },
    {
        "name": "When2Call",
        "category": "ACTIONS",
        "calc_weight": 1.0,
        "description": "Tool call timing \u2014 when to trigger, when to follow-up, etc.",
        "summary": (
            "Evaluates tool-call timing decisions: knowing when to invoke a tool, when to ask "
            "for clarification first, and when to provide a direct answer without tools. Tests "
            "the critical judgment of whether a function call is appropriate given the current "
            "conversational context and available information."
        ),
        "paper": None,
    },
    {
        "name": "GRM-Bench \u2014 Prompt Robustness",
        "category": "ACTIONS",
        "calc_weight": 1.0,
        "description": "Same prompt expressed differently still invoking intended tools",
        "summary": (
            "Nvidia-authored benchmark testing whether semantically equivalent prompts "
            "expressed in different phrasings, formality levels, and syntactic structures "
            "still correctly invoke the intended tools and actions. Measures robustness of "
            "tool-call intent recognition against natural language variation."
        ),
        "paper": None,
    },
    {
        "name": "BFCLv4",
        "category": "ACTIONS",
        "calc_weight": 0.5,
        "description": "Adds memory into the loop, and tests format sensitivity",
        "summary": (
            "Berkeley Function-Calling Leaderboard v4 extends v3 with memory-augmented tool "
            "calling scenarios and tests sensitivity to format variations in function schemas. "
            "Evaluates how well models handle evolving context windows and maintain tool-call "
            "accuracy when schema formats shift."
        ),
        # Links the memory instalment of the BFCL v4 blog posts.
        # NOTE(review): confirm this is the preferred single reference for v4.
        "paper": "https://gorilla.cs.berkeley.edu/blogs/16_bfcl_v4_memory.html",
    },
    {
        "name": "T-Eval",
        "category": "ACTIONS",
        "calc_weight": 0.5,
        "description": "Step-by-step tool use and orchestration, logical tool decomposition",
        "summary": (
            "T-Eval evaluates step-by-step tool use and orchestration capabilities. Tests "
            "logical decomposition of complex tasks into tool-calling sequences, including "
            "plan generation, tool selection, argument filling, and response summarization. "
            "Provides fine-grained analysis of where in the tool-use pipeline models fail."
        ),
        "paper": "https://arxiv.org/abs/2312.14033",
    },

    # ── GENERAL (33%) ──────────────────────────────────────────────
    {
        "name": "RULER",
        "category": "GENERAL",
        "calc_weight": 1.0,
        "description": "Needle-in-haystack + polluted state stress test",
        # NOTE(review): verify the acronym expansion below against the linked paper.
        "summary": (
            "RULER (Real-world Understanding of Long-context and Evaluation through Retrieval) "
            "extends needle-in-a-haystack testing with multiple retrieval types, multi-hop "
            "composition, and aggregation tasks at varying context lengths. Includes polluted "
            "state and distractor injection to stress-test long-context faithfulness."
        ),
        "paper": "https://arxiv.org/abs/2404.06654",
    },
    {
        "name": "GaRAGe",
        "category": "GENERAL",
        "calc_weight": 1.0,
        "description": "Deflect/refuse action when state is insufficient/corrupted",
        "summary": (
            "GaRAGe (Grounded and Attributed RAG Evaluation) tests a model's ability to "
            "deflect or refuse action when the provided retrieval context is insufficient, "
            "corrupted, or contradictory. Evaluates robustness against adversarial or low-"
            "quality retrieved passages and the model's capacity to say 'I don't know' rather "
            "than hallucinate an answer."
        ),
        "paper": None,
    },
    {
        "name": "IFBench",
        "category": "GENERAL",
        "calc_weight": 0.5,
        "description": "Generic instruction following, not as prone to overfit vs IFEval",
        "summary": (
            "IFBench from AllenAI evaluates precise instruction following with 294 single-turn "
            "questions testing counting, formatting, and sentence manipulation. Uses a loose "
            "evaluation mode that accounts for extraneous text or formatting. Designed to be "
            "less prone to overfitting compared to IFEval."
        ),
        "paper": "https://arxiv.org/abs/2507.02833",
    },
    {
        "name": "AA LCR",
        "category": "GENERAL",
        "calc_weight": 0.5,
        "description": "Complex reasoning across long contexts",
        "summary": (
            "Artificial Analysis Long Context Reasoning benchmark with 100 hard text-based "
            "questions spanning 7 document categories (Company Reports, Industry Reports, "
            "Government Consultations, Academia, Legal, Marketing, Surveys). Requires ~100K "
            "tokens of input per question, demanding reasoning across multiple long documents."
        ),
        "paper": "https://artificialanalysis.ai/methodology/intelligence-benchmarking",
    },
    {
        "name": "StructEval-T",
        "category": "GENERAL",
        "calc_weight": 0.5,
        "description": "Format-following",
        "summary": (
            "StructEval-T evaluates structured output and format-following capabilities. "
            "Tests whether models can adhere to specified output templates, formatting "
            "constraints (JSON, XML, Markdown, tables), and structural requirements while "
            "maintaining content accuracy."
        ),
        "paper": None,
    },
    {
        "name": "InverseIFEval",
        "category": "GENERAL",
        "calc_weight": 0.5,
        "description": "Unconventional instruction following",
        "summary": (
            "Tests unconventional and counter-intuitive instruction following where models "
            "must comply with unusual or inverted constraints. Evaluates whether models can "
            "faithfully execute instructions that go against typical patterns, such as "
            "intentionally producing specific error formats or following negative constraints."
        ),
        "paper": None,
    },
    {
        "name": "RAGTruth",
        "category": "GENERAL",
        "calc_weight": 0.5,
        "description": "Hallucinations relative to retrieved context",
        "summary": (
            "RAGTruth benchmarks hallucination detection and prevention in RAG pipelines. "
            "Evaluates whether models faithfully ground responses in provided retrieved "
            "documents rather than generating unsupported claims. Covers summary-level and "
            "sentence-level faithfulness across diverse document types."
        ),
        "paper": "https://arxiv.org/abs/2401.00396",
    },
    {
        "name": "SpatialText",
        "category": "GENERAL",
        "calc_weight": 0.5,
        "description": "Text-based spatial cognition",
        "summary": (
            "SpatialText tests text-based spatial cognition — understanding spatial "
            "relationships, positions, orientations, and arrangements described purely through "
            "natural language. Critical for gaming scenarios involving navigation, object "
            "placement, and environmental descriptions."
        ),
        "paper": None,
    },
    {
        "name": "SpartQA",
        "category": "GENERAL",
        "calc_weight": 0.5,
        "description": "Spatial reasoning with textual spatial descriptions and Q&A",
        "summary": (
            "SpartQA is a question-answering benchmark for spatial reasoning from textual "
            "descriptions. Models must understand object positions, relative locations, "
            "containment, and spatial logic described in natural language passages, then answer "
            "questions requiring spatial inference. Includes FindRelation, FindBlock, and "
            "YesNo question types."
        ),
        "paper": "https://arxiv.org/abs/2104.05832",
    },
    {
        "name": "COPA",
        "category": "GENERAL",
        "calc_weight": 0.5,
        "description": "Premise + choose plausible cause/effect",
        "summary": (
            "Choice of Plausible Alternatives (COPA) is a classic commonsense causal reasoning "
            "benchmark. Given a premise, the model must select the more plausible cause or "
            "effect from two options. Tests open-domain commonsense causal reasoning critical "
            "for narrative coherence in game dialogue."
        ),
        "paper": "https://people.ict.usc.edu/~gordon/copa.html",
    },
    {
        "name": "PIQA",
        "category": "GENERAL",
        "calc_weight": 0.5,
        "description": "Physical interaction common sense",
        "summary": (
            "Physical Interaction Question Answering tests physical commonsense knowledge "
            "about everyday objects, their properties, affordances, and interactions. Each "
            "question presents a goal and two solutions; the model must select the physically "
            "plausible one. Relevant for game NPCs reasoning about physical world interactions."
        ),
        "paper": "https://arxiv.org/abs/1911.11641",
    },
]

# Category keys, matching the "category" values used by the registry entries.
CATEGORIES = ["ROLEPLAY", "ACTIONS", "GENERAL"]

# Every category contributes an equal share (one third, i.e. 33.3%).
CATEGORY_WEIGHT = 1 / len(CATEGORIES)

# Human-readable label for each category key.
CATEGORY_DISPLAY = dict(
    ROLEPLAY="Roleplay",
    ACTIONS="Actions",
    GENERAL="General",
)


def get_benchmarks_by_category(category: str) -> list[dict]:
    """Return the registry entries whose ``"category"`` equals *category*.

    Entries are returned in the order they appear in ``BENCHMARKS``. The
    comparison is an exact string match, so a value that matches no entry
    produces an empty list.
    """
    selected = []
    for entry in BENCHMARKS:
        if entry["category"] == category:
            selected.append(entry)
    return selected


def get_all_benchmark_names() -> list[str]:
    """Return the ``"name"`` of every entry in ``BENCHMARKS``, in registry order."""
    names = []
    for entry in BENCHMARKS:
        names.append(entry["name"])
    return names