rhli commited on
Commit
dad152b
·
verified ·
1 Parent(s): a722b84

[genarena deploy] Upload genarena package

Browse files
genarena/deploy/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """GenArena deploy module for HuggingFace Spaces deployment."""
2
+
3
+ from genarena.deploy.app import main
4
+
5
+ __all__ = ["main"]
genarena/prompts/__init__.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prompt module loader and validator."""
2
+
3
+ import importlib
4
+ import importlib.util
5
+ import os
6
+ from types import ModuleType
7
+ from typing import Optional
8
+
9
+
10
+ # Required attributes for a valid prompt module
11
+ REQUIRED_ATTRIBUTES = ["PROMPT_TEXT", "ALLOW_TIE", "build_prompt", "parse_response"]
12
+
13
+
14
def load_prompt(name: str) -> ModuleType:
    """
    Load a prompt module by name.

    Resolution order: first the ``genarena.prompts`` package, then a
    filesystem path if ``name`` looks like one (ends with ``.py`` or
    contains a path separator). The loaded module is validated against
    the required prompt contract before being returned.

    Args:
        name: Prompt module name (e.g., 'mmrb2') or path to a .py file

    Returns:
        Loaded module

    Raises:
        ImportError: If module cannot be found
        ValueError: If module is invalid
    """
    module: Optional[ModuleType] = None

    # Preferred source: a module shipped inside the genarena.prompts package.
    try:
        module = importlib.import_module(f"genarena.prompts.{name}")
    except ImportError:
        module = None

    # Fallback: treat the name as a filesystem path to a .py file.
    looks_like_path = name.endswith('.py') or os.path.sep in name
    if module is None and looks_like_path and os.path.isfile(name):
        spec = importlib.util.spec_from_file_location("custom_prompt", name)
        if spec and spec.loader:
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)

    if module is None:
        raise ImportError(
            f"Could not load prompt module '{name}'. "
            f"Make sure it exists in genarena/prompts/ or provide a valid file path."
        )

    # Reject modules that do not expose the full prompt contract.
    if not validate_prompt(module):
        missing = get_missing_attributes(module)
        raise ValueError(
            f"Invalid prompt module '{name}'. "
            f"Missing required attributes: {missing}"
        )

    return module
62
+
63
+
64
def validate_prompt(module: ModuleType) -> bool:
    """
    Check whether a module satisfies the prompt contract.

    Required attributes:
    - PROMPT_TEXT: str - The evaluation prompt text
    - ALLOW_TIE: bool - Whether single-round ties are allowed
    - build_prompt: callable - Function to build VLM messages
    - parse_response: callable - Function to parse VLM response

    Args:
        module: Module to validate

    Returns:
        True if valid, False otherwise
    """
    must_be_callable = ("build_prompt", "parse_response")

    for required in REQUIRED_ATTRIBUTES:
        if not hasattr(module, required):
            return False
        # The two hook attributes must actually be callable, not just present.
        if required in must_be_callable and not callable(getattr(module, required)):
            return False

    return True
90
+
91
+
92
def get_missing_attributes(module: ModuleType) -> list[str]:
    """
    Report which required prompt attributes a module lacks.

    Entries that exist but are not callable (for the hook attributes)
    are reported with a ``(not callable)`` suffix.

    Args:
        module: Module to check

    Returns:
        List of missing attribute names
    """
    problems: list[str] = []
    for required in REQUIRED_ATTRIBUTES:
        if not hasattr(module, required):
            problems.append(required)
            continue
        # Present but unusable: the hook attributes must be callable.
        if required in ("build_prompt", "parse_response") and not callable(getattr(module, required)):
            problems.append(f"{required} (not callable)")
    return problems
110
+
111
+
112
def list_available_prompts() -> list[str]:
    """
    List all available prompt modules in the prompts directory.

    Scans the directory containing this module for ``*.py`` files,
    skipping private/dunder modules (leading underscore).

    Returns:
        Sorted list of prompt module names (without the .py extension)
    """
    prompts_dir = os.path.dirname(__file__)
    names = [
        entry[:-3]  # strip the ".py" suffix
        for entry in os.listdir(prompts_dir)
        if entry.endswith('.py') and not entry.startswith('_')
    ]
    return sorted(names)
genarena/prompts/mmrb2.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MMRB2 prompt implementation for image editing evaluation.
2
+
3
+ This module implements the MMRB2 evaluation prompt for pairwise comparison
4
+ of image editing results. It uses a 1-6 scoring scale and does not allow
5
+ ties in single rounds.
6
+
7
+ Reference: MMRB2 evaluation framework
8
+ """
9
+
10
+ import base64
11
+ import io
12
+ import re
13
+ from typing import Any, Union
14
+
15
+ import json_repair
16
+ from PIL import Image as PILImage
17
+
18
+
19
+ # Whether single-round ties are allowed (mmrb2 requires a winner)
20
+ ALLOW_TIE = False
21
+
22
+
23
+ # The full evaluation prompt text from get_image_edit_prompt()
24
+ PROMPT_TEXT = """You are an expert in image editing quality analysis and AI evaluation. Your role is to act as an objective judge for comparing two AI-generated image editing responses to the same prompt. You will evaluate which response is better based on a comprehensive rubric specifically designed for image editing tasks.
25
+
26
+ **Important Guidelines:**
27
+ - Be completely impartial and avoid any position biases
28
+ - Ensure that the order in which the responses were presented does not influence your decision
29
+ - Do not allow the length of the responses to influence your evaluation
30
+ - Do not favor certain model names or types
31
+ - Be as objective as possible in your assessment
32
+ - Focus on image editing specific factors: faithfulness to editing instructions, preservation of input image elements, and overall editing quality
33
+
34
+ **Understanding the Content Structure:**
35
+ - **[ORIGINAL PROMPT TO MODEL:]**: This is the image editing instruction given to both AI models
36
+ - **[INPUT IMAGE FROM PROMPT:]**: This is the source image provided to both models for editing
37
+ - **[RESPONSE A:]**: The first model's edited image response
38
+ - **[RESPONSE B:]**: The second model's edited image response
39
+
40
+ Your evaluation must be based on a fine-grained rubric that covers the following criteria. For each criterion, you must provide detailed step-by-step reasoning comparing both responses. You will use a 1-6 scoring scale.
41
+
42
+ **Evaluation Criteria:**
43
+ 1. **text_faithfulness:** Which response better adheres to the text editing instruction? Consider how well each response follows the specific editing instructions (e.g., adding objects, changing colors, modifying scenes).
44
+
45
+ 2. **image_faithfulness:** Which response better respects and incorporates the key elements of the input image? Consider how well each response preserves important aspects of the original image (composition, lighting, style, background elements) while making the requested changes.
46
+
47
+ 3. **overall_image_quality:** Which response has better general technical and aesthetic quality, with fewer visual artifacts, distortions, or inconsistencies introduced during the editing process?
48
+
49
+ 4. **text_rendering:** If either response contains rendered text, which one has better text quality (spelling, legibility, integration with the image)? If no text is rendered, state "Not Applicable."
50
+
51
+ **Scoring Rubric:**
52
+ - Score 6 (A is significantly better): Response A is significantly superior across most criteria
53
+ - Score 5 (A is marginally better): Response A is noticeably better across several criteria
54
+ - Score 4 (Unsure or A is negligibly better): Response A is slightly better or roughly equivalent
55
+ - Score 3 (Unsure or B is negligibly better): Response B is slightly better or roughly equivalent
56
+ - Score 2 (B is marginally better): Response B is noticeably better across several criteria
57
+ - Score 1 (B is significantly better): Response B is significantly superior across most criteria
58
+
59
+ **Confidence Assessment:**
60
+ After your evaluation, assess your confidence in this judgment on a scale of 0.0 to 1.0:
61
+
62
+ **CRITICAL**: Be EXTREMELY conservative with confidence scores. Most comparisons should be in the 0.2-0.5 range.
63
+
64
+ - **Very High Confidence (0.8-1.0)**: ONLY for absolutely obvious cases where one response is dramatically better across ALL criteria with zero ambiguity. Use this extremely rarely (less than 10% of cases).
65
+ - **High Confidence (0.6-0.7)**: Clear differences but some uncertainty remains. Use sparingly (less than 20% of cases).
66
+ - **Medium Confidence (0.4-0.5)**: Noticeable differences but significant uncertainty. This should be your DEFAULT range.
67
+ - **Low Confidence (0.2-0.3)**: Very close comparison, difficult to distinguish. Responses are roughly equivalent or have conflicting strengths.
68
+ - **Very Low Confidence (0.0-0.1)**: Essentially indistinguishable responses or major conflicting strengths.
69
+
70
+ **IMPORTANT GUIDELINES**:
71
+ - DEFAULT to 0.3-0.5 range for most comparisons
72
+ - Only use 0.6+ when you are absolutely certain
73
+ - Consider: Could reasonable people disagree on this comparison?
74
+ - Consider: Are there any strengths in the "worse" response?
75
+ - Consider: How obvious would this be to a human evaluator?
76
+ - Remember: Quality assessment is inherently subjective
77
+
78
+ After your reasoning, you will provide a final numerical score, indicate which response is better, and assess your confidence. You must always output your response in the following structured JSON format:
79
+
80
+ {
81
+ "reasoning": {
82
+ "text_faithfulness": "YOUR REASONING HERE",
83
+ "image_faithfulness": "YOUR REASONING HERE",
84
+ "overall_image_quality": "YOUR REASONING HERE",
85
+ "text_rendering": "YOUR REASONING HERE",
86
+ "comparison_summary": "YOUR OVERALL COMPARISON SUMMARY HERE"
87
+ },
88
+ "score": <int 1-6>,
89
+ "better_response": "A" or "B",
90
+ "confidence": <float 0.0-1.0>,
91
+ "confidence_rationale": "YOUR CONFIDENCE ASSESSMENT REASONING HERE"
92
+ }"""
93
+
94
+
95
def _encode_image_to_base64(image_source: Union[str, bytes, PILImage.Image, io.BytesIO, dict[str, Any]]) -> str:
    """
    Encode an image to a base64 string.

    Accepts a file path, raw bytes, a PIL image, a BytesIO buffer, or a
    HuggingFace-datasets-style dict ({"bytes": ..., "path": ...}).

    Args:
        image_source: Either a file path (str), raw bytes, PIL.Image object, or BytesIO

    Returns:
        Base64 encoded string

    Raises:
        TypeError: If image_source type is not supported
        ValueError: If image_source cannot be converted to bytes
    """
    data: bytes

    if isinstance(image_source, str):
        # File path on disk.
        with open(image_source, "rb") as fh:
            data = fh.read()
    elif isinstance(image_source, io.BytesIO):
        # In-memory buffer; rewind before reading.
        image_source.seek(0)
        data = image_source.read()
    elif isinstance(image_source, PILImage.Image):
        # PIL image (e.g. from HuggingFace datasets): serialize as PNG.
        sink = io.BytesIO()
        image_source.save(sink, format="PNG")
        data = sink.getvalue()
    elif isinstance(image_source, dict):
        # HuggingFace datasets Image() feature dict.
        if "bytes" in image_source:
            payload = image_source["bytes"]
            if isinstance(payload, bytes):
                data = payload
            elif isinstance(payload, io.BytesIO):
                payload.seek(0)
                data = payload.read()
            else:
                # Nested/unexpected payload type: recurse to resolve it.
                return _encode_image_to_base64(payload)
        elif "path" in image_source and image_source["path"]:
            with open(image_source["path"], "rb") as fh:
                data = fh.read()
        else:
            raise ValueError(f"Cannot extract image from dict: {image_source.keys()}")
    elif isinstance(image_source, bytes):
        # Raw bytes; deliberately checked after the more specific types.
        data = image_source
    else:
        raise TypeError(
            f"Unsupported image type: {type(image_source).__name__}. "
            f"Expected str (path), bytes, PIL.Image, io.BytesIO, or dict. "
            f"Got: {repr(image_source)[:200]}"
        )

    # Defensive check: every branch above must have produced raw bytes.
    if not isinstance(data, bytes):
        raise ValueError(
            f"Failed to convert image to bytes. "
            f"Got {type(data).__name__} instead. "
            f"Original input was {type(image_source).__name__}"
        )

    return base64.b64encode(data).decode("utf-8")
161
+
162
+
163
def _get_image_media_type(image_source: Union[str, bytes, PILImage.Image]) -> str:
    """
    Determine the media type of an image.

    File paths are classified by extension, raw bytes by their magic
    numbers; unknown inputs default to PNG.

    Args:
        image_source: Either a file path (str), raw bytes, or PIL.Image object

    Returns:
        Media type string (e.g., 'image/png')
    """
    if isinstance(image_source, str):
        # Map the filename extension to a media type; default to PNG.
        suffix = image_source.lower().split('.')[-1]
        by_extension = {
            'png': 'image/png',
            'jpg': 'image/jpeg',
            'jpeg': 'image/jpeg',
            'webp': 'image/webp',
            'gif': 'image/gif',
        }
        return by_extension.get(suffix, 'image/png')

    if isinstance(image_source, PILImage.Image):
        # PIL images are re-encoded as PNG elsewhere in this module.
        return 'image/png'

    # Raw bytes: sniff well-known magic numbers, defaulting to PNG.
    if image_source[:8] == b'\x89PNG\r\n\x1a\n':
        return 'image/png'
    if image_source[:2] == b'\xff\xd8':
        return 'image/jpeg'
    if image_source[:4] == b'RIFF' and image_source[8:12] == b'WEBP':
        return 'image/webp'
    return 'image/png'
196
+
197
+
198
def _create_image_content(image_source: Union[str, bytes]) -> dict[str, Any]:
    """
    Build an OpenAI API image content block from an image source.

    Args:
        image_source: Either a file path (str) or raw bytes

    Returns:
        Image content dict for OpenAI API (data-URL form)
    """
    encoded = _encode_image_to_base64(image_source)
    mime = _get_image_media_type(image_source)
    # OpenAI vision API accepts inline images as base64 data URLs.
    return {
        "type": "image_url",
        "image_url": {
            "url": f"data:{mime};base64,{encoded}"
        }
    }
217
+
218
+
219
def build_prompt(
    instruction: str,
    input_images: list[Union[str, bytes]],
    output_image_a: Union[str, bytes],
    output_image_b: Union[str, bytes]
) -> list[dict[str, Any]]:
    """
    Build the VLM prompt messages for pairwise evaluation.

    Constructs messages in the format:
        [EVALUATION PROMPT TEXT]
        [ORIGINAL PROMPT TO MODEL:]
        {instruction and input_images}
        [RESPONSE A:]
        {output_image_a}
        [RESPONSE B:]
        {output_image_b}

    Args:
        instruction: The editing instruction given to models
        input_images: List of input images (file paths or bytes)
        output_image_a: Output from model A (file path or bytes)
        output_image_b: Output from model B (file path or bytes)

    Returns:
        List of message dicts for OpenAI Chat Completion API
    """
    def text_block(text: str) -> dict[str, Any]:
        # Wrap plain text in an OpenAI content block.
        return {"type": "text", "text": text}

    # Evaluation prompt, then the original task (instruction text).
    content: list[dict[str, Any]] = [
        text_block(PROMPT_TEXT),
        text_block("[ORIGINAL PROMPT TO MODEL:]"),
        text_block(instruction),
    ]

    # Optional source images the models edited.
    if input_images:
        content.append(text_block("[INPUT IMAGE FROM PROMPT:]"))
        for img in input_images:
            content.append(_create_image_content(img))

    # The two candidate outputs, in fixed A/B order.
    content.append(text_block("[RESPONSE A:]"))
    content.append(_create_image_content(output_image_a))
    content.append(text_block("[RESPONSE B:]"))
    content.append(_create_image_content(output_image_b))

    # Single user-role message in OpenAI API format.
    return [{"role": "user", "content": content}]
297
+
298
+
299
def parse_response(response: str) -> dict[str, Any]:
    """
    Parse the VLM judge response.

    Extracts structured information from the VLM's JSON response,
    tolerating markdown code fences and minor JSON errors (via
    json_repair). Score is clamped to 1-6 and confidence to 0.0-1.0.

    Args:
        response: Raw response text from VLM

    Returns:
        Dict containing:
        - winner: "A" or "B" (from better_response field)
        - score: int 1-6
        - confidence: float 0.0-1.0
        - reasoning: dict with evaluation criteria
        - raw_response: the original parsed JSON

    Raises:
        ValueError: If response cannot be parsed
    """
    # Strip optional markdown code fences around the JSON payload.
    cleaned = response.strip()
    cleaned = re.sub(r"^```(?:json)?\s*\n?", "", cleaned)
    cleaned = re.sub(r"\n?```\s*$", "", cleaned)

    # json_repair tolerates trailing commas, unquoted keys, etc.
    try:
        parsed = json_repair.loads(cleaned)
    except Exception as e:
        raise ValueError(f"Failed to parse JSON response: {e}\nResponse was:\n{response}")

    raw_winner = parsed.get("better_response", "")
    if not isinstance(raw_winner, str):
        raise ValueError(f"better_response must be a string, got: {type(raw_winner)}")

    # Normalize winner to a bare "A" or "B".
    winner = raw_winner.upper().strip()
    if winner not in ("A", "B"):
        # Fall back to substring search ("A" takes precedence).
        if "A" in winner:
            winner = "A"
        elif "B" in winner:
            winner = "B"
        else:
            raise ValueError(f"Invalid better_response value: {raw_winner}")

    # Score: coerce strings, clamp to the 1-6 rubric range (default 4).
    score = parsed.get("score", 4)
    if isinstance(score, str):
        score = int(score)
    score = max(1, min(6, score))

    # Confidence: coerce strings, clamp to [0.0, 1.0] (default 0.5).
    confidence = parsed.get("confidence", 0.5)
    if isinstance(confidence, str):
        confidence = float(confidence)
    confidence = max(0.0, min(1.0, confidence))

    return {
        "winner": winner,
        "score": score,
        "confidence": confidence,
        "reasoning": parsed.get("reasoning", {}),
        "raw_response": parsed
    }
genarena/sync/__init__.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sync module for GenArena.
3
+
4
+ This module provides Git version control and Huggingface synchronization
5
+ capabilities for arena data.
6
+ """
7
+
8
+ from genarena.sync.git_ops import (
9
+ is_git_initialized,
10
+ git_init,
11
+ ensure_gitignore,
12
+ git_add_all,
13
+ git_commit,
14
+ has_uncommitted_changes,
15
+ git_remote_add,
16
+ git_remote_get_url,
17
+ git_push,
18
+ git_sync,
19
+ )
20
+
21
+ from genarena.sync.auto_commit import (
22
+ auto_commit_and_push,
23
+ with_auto_commit,
24
+ )
25
+
26
+ from genarena.sync.hf_ops import (
27
+ get_hf_token,
28
+ require_hf_token,
29
+ validate_dataset_repo,
30
+ list_repo_files,
31
+ get_repo_file_info,
32
+ upload_file,
33
+ upload_files_batch,
34
+ download_file,
35
+ check_file_exists,
36
+ upload_arena_data,
37
+ pull_arena_data,
38
+ list_repo_contents,
39
+ )
40
+
41
+ from genarena.sync.packer import (
42
+ pack_model_dir,
43
+ pack_exp_dir,
44
+ unpack_zip,
45
+ collect_upload_tasks,
46
+ collect_download_tasks,
47
+ TempPackingContext,
48
+ TaskType,
49
+ PackTask,
50
+ UnpackTask,
51
+ )
52
+
53
+ from genarena.sync.init_ops import (
54
+ DEFAULT_BENCHMARK_REPO,
55
+ DEFAULT_ARENA_REPO,
56
+ discover_repo_subsets,
57
+ download_benchmark_data,
58
+ init_arena,
59
+ )
60
+
61
+ __all__ = [
62
+ # Git operations
63
+ "is_git_initialized",
64
+ "git_init",
65
+ "ensure_gitignore",
66
+ "git_add_all",
67
+ "git_commit",
68
+ "has_uncommitted_changes",
69
+ "git_remote_add",
70
+ "git_remote_get_url",
71
+ "git_push",
72
+ "git_sync",
73
+ # Auto commit
74
+ "auto_commit_and_push",
75
+ "with_auto_commit",
76
+ # Huggingface operations
77
+ "get_hf_token",
78
+ "require_hf_token",
79
+ "validate_dataset_repo",
80
+ "list_repo_files",
81
+ "get_repo_file_info",
82
+ "upload_file",
83
+ "upload_files_batch",
84
+ "download_file",
85
+ "check_file_exists",
86
+ "upload_arena_data",
87
+ "pull_arena_data",
88
+ "list_repo_contents",
89
+ # Packer utilities
90
+ "pack_model_dir",
91
+ "pack_exp_dir",
92
+ "unpack_zip",
93
+ "collect_upload_tasks",
94
+ "collect_download_tasks",
95
+ "TempPackingContext",
96
+ "TaskType",
97
+ "PackTask",
98
+ "UnpackTask",
99
+ # Init operations
100
+ "DEFAULT_BENCHMARK_REPO",
101
+ "DEFAULT_ARENA_REPO",
102
+ "discover_repo_subsets",
103
+ "download_benchmark_data",
104
+ "init_arena",
105
+ ]
genarena/sync/auto_commit.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Auto commit module for GenArena.
3
+
4
+ This module provides automatic commit and push functionality
5
+ that is triggered after command execution.
6
+ """
7
+
8
+ import logging
9
+ from typing import Callable, TypeVar
10
+
11
+ from genarena.sync.git_ops import (
12
+ is_git_initialized,
13
+ has_uncommitted_changes,
14
+ git_commit,
15
+ git_push,
16
+ git_remote_get_url,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Type variable for generic decorator
22
+ T = TypeVar("T")
23
+
24
+
25
def auto_commit_and_push(arena_dir: str, command_name: str) -> None:
    """
    Automatically commit and push changes after a command execution.

    Designed to run after commands that modify arena_dir content
    (e.g., run, merge, delete). A missing Git repo is skipped silently;
    any commit/push failure is only logged as a warning so the main
    command flow is never interrupted.

    Args:
        arena_dir: Path to the arena directory
        command_name: Name of the command that triggered this auto-commit
    """
    # Nothing to do for arenas that were never git-initialized.
    if not is_git_initialized(arena_dir):
        return

    if not has_uncommitted_changes(arena_dir):
        logger.debug(f"No changes to commit after {command_name}")
        return

    # Commit; bail out (with a warning) on any failure.
    try:
        committed, commit_msg = git_commit(arena_dir, command_name=command_name)
    except Exception as e:
        logger.warning(f"Auto-commit failed with exception: {e}")
        return
    if not committed:
        logger.warning(f"Auto-commit failed: {commit_msg}")
        return
    if "Nothing to commit" not in commit_msg:
        logger.info(f"Auto-committed changes: {commit_msg}")

    # Push only when a remote is configured.
    if not git_remote_get_url(arena_dir):
        logger.debug("No remote configured, skipping auto-push")
        return

    try:
        pushed, push_msg = git_push(arena_dir)
    except Exception as e:
        logger.warning(f"Auto-push failed with exception: {e}")
        return
    if pushed:
        logger.info(f"Auto-pushed changes: {push_msg}")
    else:
        logger.warning(f"Auto-push failed: {push_msg}")
75
+
76
+
77
def with_auto_commit(command_name: str):
    """
    Decorator that adds auto-commit functionality to command functions.

    The decorated function must expose 'arena_dir' either as a keyword
    argument or as an attribute of its first positional argument
    (typically an argparse.Namespace). Auto-commit only runs when the
    command returns 0 (success).

    Args:
        command_name: Name of the command for commit message

    Returns:
        Decorator function
    """
    import functools  # local import: keeps the module's import surface unchanged

    def decorator(func: Callable[..., int]) -> Callable[..., int]:
        # functools.wraps preserves __name__/__doc__/etc. of the wrapped
        # command so logging, help text, and introspection stay correct.
        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> int:
            # Execute the original command first.
            result = func(*args, **kwargs)

            # Only auto-commit if the command succeeded (return code 0).
            if result == 0:
                # Locate arena_dir: explicit kwarg takes precedence over an
                # argparse.Namespace passed as the first positional argument.
                arena_dir = None
                if "arena_dir" in kwargs:
                    arena_dir = kwargs["arena_dir"]
                elif args and hasattr(args[0], "arena_dir"):
                    arena_dir = args[0].arena_dir

                if arena_dir:
                    auto_commit_and_push(arena_dir, command_name)

            return result

        return wrapper

    return decorator
genarena/sync/deploy_ops.py ADDED
@@ -0,0 +1,539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Deploy operations for GenArena.
3
+
4
+ Handles uploading arena data to HuggingFace for Spaces deployment.
5
+ Unlike `hf upload`, this uploads images directly (not as ZIP) for CDN access.
6
+ Parquet benchmark data is downloaded from rhli/genarena during Docker build.
7
+ """
8
+
9
+ import logging
10
+ import os
11
+ from multiprocessing import Pool
12
+ from typing import Optional
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ # Default multiprocessing settings
18
+ DEFAULT_NUM_WORKERS = 16
19
+ DEFAULT_WORKER_TIMEOUT = 300 # seconds
20
+
21
+
22
def upload_for_deploy(
    arena_dir: str,
    arena_repo: str,
    space_repo: str,
    subsets: Optional[list[str]] = None,
    overwrite: bool = False,
    show_progress: bool = True,
    max_retries: int = 3,
    num_workers: int = DEFAULT_NUM_WORKERS,
    worker_timeout: int = DEFAULT_WORKER_TIMEOUT,
) -> tuple[bool, str]:
    """
    Upload all data needed for HuggingFace Spaces deployment.

    Two-step process:
    1. Arena data (pk_logs, models, state.json) to arena_repo (Dataset).
       Images are uploaded directly (not as ZIP) for CDN access,
       following symlinks to reach the actual image files.
    2. Deploy files (Dockerfile, app.py, README.md) to space_repo.

    Note: Parquet benchmark data is NOT uploaded. It is downloaded from
    rhli/genarena during Docker build in the Space.

    Args:
        arena_dir: Local arena directory
        arena_repo: HF Dataset repo for arena data
        space_repo: HF Space repo for deployment
        subsets: Subsets to upload (None = all)
        overwrite: Overwrite existing files
        show_progress: Show progress bar
        max_retries: Max retries per file
        num_workers: Number of parallel workers for upload (default: 16)
        worker_timeout: Timeout in seconds for each worker (default: 300)

    Returns:
        Tuple of (success, message)
    """
    from genarena.sync.hf_ops import (
        require_hf_token,
        validate_dataset_repo,
    )

    # Resolve the HF token up front; every step below needs it.
    try:
        token = require_hf_token()
    except ValueError as e:
        return False, str(e)

    summaries: list[str] = []

    # Step 1: arena data -> Dataset repo (individual images for CDN access).
    logger.info(f"Uploading arena data to {arena_repo}...")
    repo_ok, detail = validate_dataset_repo(arena_repo, token)
    if not repo_ok:
        return False, f"Arena repo validation failed: {detail}"

    arena_ok, detail = upload_arena_data_for_cdn(
        arena_dir=arena_dir,
        repo_id=arena_repo,
        subsets=subsets,
        overwrite=overwrite,
        show_progress=show_progress,
        token=token,
        num_workers=num_workers,
        worker_timeout=worker_timeout,
    )
    if not arena_ok:
        return False, f"Arena upload failed: {detail}"
    summaries.append(f"Arena data: {detail}")

    # Step 2: Dockerfile/app.py/README.md -> Space repo.
    logger.info(f"Uploading deploy files to {space_repo}...")
    deploy_ok, detail = upload_deploy_files(
        space_repo=space_repo,
        overwrite=overwrite,
        token=token,
    )
    if not deploy_ok:
        return False, f"Deploy files upload failed: {detail}"
    summaries.append(f"Deploy files: {detail}")

    return True, "\n".join(summaries)
104
+
105
+
106
def collect_files_follow_symlinks(
    base_dir: str,
    path_prefix: str = "",
) -> list[tuple[str, str]]:
    """
    Collect all files under base_dir, following symlinks.

    Hidden files/directories, ``__pycache__`` and ``raw_outputs``
    directories are skipped. Remote paths always use forward slashes.

    Args:
        base_dir: Directory to scan
        path_prefix: Prefix for remote paths

    Returns:
        List of (local_path, remote_path) tuples; empty if base_dir
        does not exist or is not a directory.
    """
    files: list[tuple[str, str]] = []

    if not os.path.isdir(base_dir):
        return files

    # followlinks=True so symlinked model/image directories are traversed.
    for root, dirs, filenames in os.walk(base_dir, followlinks=True):
        # Prune hidden and special directories in place so os.walk skips them.
        dirs[:] = [d for d in dirs if not d.startswith(".") and d != "__pycache__" and d != "raw_outputs"]

        rel_root = os.path.relpath(root, base_dir)
        if rel_root == ".":
            rel_root = ""

        for filename in filenames:
            if filename.startswith("."):
                continue

            local_path = os.path.join(root, filename)

            # Build remote path as prefix/rel_root/filename, skipping empty
            # components. Fix: the remote path must end with the actual
            # filename (previously a literal placeholder was emitted).
            components = [part for part in (path_prefix, rel_root, filename) if part]
            remote_path = "/".join(components)

            # Normalize path separators (rel_root uses backslashes on Windows).
            remote_path = remote_path.replace("\\", "/")

            files.append((local_path, remote_path))

    return files
152
+
153
+
154
def _upload_batch_worker(args: tuple) -> tuple[int, int]:
    """
    Worker function for uploading a single batch.

    Stages one CommitOperationAdd per file, then commits the whole
    batch in one HF commit, retrying on failure.

    Args:
        args: Tuple of (batch_index, batch, repo_id, token, total_batches, max_retries)

    Returns:
        Tuple of (uploaded_count, failed_count)
    """
    from huggingface_hub import HfApi, CommitOperationAdd

    batch_index, batch, repo_id, token, total_batches, max_retries = args

    client = HfApi(token=token)

    # Stage one operation per file; count files that cannot be read.
    staged = []
    unreadable = 0
    for local_path, remote_path in batch:
        try:
            staged.append(
                CommitOperationAdd(
                    path_in_repo=remote_path,
                    path_or_fileobj=local_path,
                )
            )
        except Exception as e:
            logger.warning(f"Failed to read {local_path}: {e}")
            unreadable += 1

    if not staged:
        return 0, unreadable

    # Commit the batch as a single HF commit, with retries.
    for attempt in range(max_retries):
        try:
            client.create_commit(
                repo_id=repo_id,
                repo_type="dataset",
                operations=staged,
                commit_message=f"[genarena deploy] Upload batch {batch_index + 1}/{total_batches}",
            )
            return len(staged), unreadable
        except Exception as e:
            if attempt < max_retries - 1:
                logger.warning(f"Batch {batch_index + 1} failed (attempt {attempt + 1}), retrying: {e}")
            else:
                logger.error(f"Batch {batch_index + 1} failed after {max_retries} attempts: {e}")
                return 0, len(staged) + unreadable

    # Defensive fallback (only reachable if max_retries < 1).
    return 0, len(staged) + unreadable
205
+
206
+
207
+ def upload_arena_data_for_cdn(
208
+ arena_dir: str,
209
+ repo_id: str,
210
+ subsets: Optional[list[str]] = None,
211
+ overwrite: bool = False,
212
+ show_progress: bool = True,
213
+ token: Optional[str] = None,
214
+ num_workers: int = DEFAULT_NUM_WORKERS,
215
+ worker_timeout: int = DEFAULT_WORKER_TIMEOUT,
216
+ ) -> tuple[bool, str]:
217
+ """
218
+ Upload arena data with images as individual files (not ZIP) for CDN access.
219
+
220
+ This function follows symlinks to upload actual image files.
221
+ Models directory often contains symlinks to external image directories.
222
+
223
+ Directory structure uploaded:
224
+ {subset}/models/{exp_name}/{model}/{index}.png (individual images)
225
+ {subset}/pk_logs/{exp_name}/*.jsonl (battle logs)
226
+ {subset}/arena/state.json (ELO state)
227
+
228
+ Args:
229
+ arena_dir: Path to the arena directory
230
+ repo_id: HuggingFace repository ID
231
+ subsets: List of subsets to upload (None = all)
232
+ overwrite: If True, overwrite existing files
233
+ show_progress: If True, show progress bar
234
+ token: HuggingFace token
235
+ num_workers: Number of parallel workers for upload (default: 16)
236
+ worker_timeout: Timeout in seconds for each worker (default: 300)
237
+
238
+ Returns:
239
+ Tuple of (success, message)
240
+ """
241
+ from huggingface_hub import HfApi
242
+
243
+ if token is None:
244
+ from genarena.sync.hf_ops import require_hf_token
245
+ token = require_hf_token()
246
+
247
+ api = HfApi(token=token)
248
+
249
+ # Validate arena directory
250
+ if not os.path.isdir(arena_dir):
251
+ return False, f"Arena directory not found: {arena_dir}"
252
+
253
+ # Discover subsets
254
+ available_subsets = [
255
+ d for d in os.listdir(arena_dir)
256
+ if os.path.isdir(os.path.join(arena_dir, d)) and not d.startswith(".")
257
+ ]
258
+
259
+ if subsets:
260
+ target_subsets = [s for s in subsets if s in available_subsets]
261
+ else:
262
+ target_subsets = available_subsets
263
+
264
+ if not target_subsets:
265
+ return False, "No subsets found to upload"
266
+
267
+ logger.info(f"Target subsets: {target_subsets}")
268
+
269
+ # Collect all files to upload (following symlinks)
270
+ all_files: list[tuple[str, str]] = []
271
+
272
+ for subset in target_subsets:
273
+ subset_dir = os.path.join(arena_dir, subset)
274
+ logger.info(f"Scanning subset: {subset}")
275
+
276
+ # Collect files from models/, pk_logs/, arena/
277
+ for subdir in ["models", "pk_logs", "arena"]:
278
+ subdir_path = os.path.join(subset_dir, subdir)
279
+ if os.path.isdir(subdir_path):
280
+ files = collect_files_follow_symlinks(subdir_path, f"{subset}/{subdir}")
281
+ all_files.extend(files)
282
+ logger.info(f" {subdir}: {len(files)} files")
283
+
284
+ if not all_files:
285
+ return False, "No files found to upload"
286
+
287
+ logger.info(f"Total files to upload: {len(all_files)}")
288
+
289
+ # Filter by extension (only upload relevant files)
290
+ valid_extensions = {".png", ".jpg", ".jpeg", ".webp", ".json", ".jsonl"}
291
+ all_files = [
292
+ (local, remote) for local, remote in all_files
293
+ if os.path.splitext(local)[1].lower() in valid_extensions
294
+ ]
295
+ logger.info(f"Files after extension filtering: {len(all_files)}")
296
+
297
+ # Filter out files in subdirectories under models/<exp>/<model>/
298
+ # Expected structure: {subset}/models/{exp}/{model}/{file}
299
+ # Files deeper than this (e.g., {subset}/models/{exp}/{model}/subfolder/{file}) should be skipped
300
+ def is_valid_model_path(remote: str) -> bool:
301
+ parts = remote.split("/")
302
+ # Non-models paths are always valid
303
+ if len(parts) < 2 or parts[1] != "models":
304
+ return True
305
+ # For models paths, expect exactly: subset/models/exp/model/file (5 parts)
306
+ return len(parts) == 5
307
+
308
+ before_depth_filter = len(all_files)
309
+ all_files = [(local, remote) for local, remote in all_files if is_valid_model_path(remote)]
310
+ depth_filtered = before_depth_filter - len(all_files)
311
+ if depth_filtered > 0:
312
+ logger.info(f"Skipped {depth_filtered} files in model subdirectories")
313
+ logger.info(f"Files after filtering: {len(all_files)}")
314
+
315
+ # Get existing files in repo (for skip check)
316
+ existing_files: set[str] = set()
317
+ if not overwrite:
318
+ try:
319
+ existing_files = set(api.list_repo_files(repo_id=repo_id, repo_type="dataset"))
320
+ logger.info(f"Existing files in repo: {len(existing_files)}")
321
+ except Exception:
322
+ pass
323
+
324
+ # Filter out existing files
325
+ if not overwrite:
326
+ original_count = len(all_files)
327
+ all_files = [
328
+ (local, remote) for local, remote in all_files
329
+ if remote not in existing_files
330
+ ]
331
+ skipped = original_count - len(all_files)
332
+ logger.info(f"Skipping {skipped} existing files, {len(all_files)} to upload")
333
+ else:
334
+ skipped = 0
335
+
336
+ if not all_files:
337
+ return True, f"All files already exist. Skipped {skipped} files."
338
+
339
+ # Upload in batches using create_commit with multiprocessing
340
+ batch_size = 500 # HuggingFace recommends smaller batches for large files
341
+ max_retries = 3
342
+
343
+ # Create batches
344
+ batches = []
345
+ for i in range(0, len(all_files), batch_size):
346
+ batch = all_files[i:i + batch_size]
347
+ batches.append(batch)
348
+
349
+ total_batches = len(batches)
350
+ logger.info(f"Uploading {total_batches} batches with {num_workers} workers (timeout: {worker_timeout}s per worker)")
351
+
352
+ # Prepare worker arguments
353
+ worker_args = [
354
+ (i, batch, repo_id, token, total_batches, max_retries)
355
+ for i, batch in enumerate(batches)
356
+ ]
357
+
358
+ total_uploaded = 0
359
+ total_failed = 0
360
+
361
+ # Use multiprocessing pool
362
+ with Pool(processes=num_workers) as pool:
363
+ if show_progress:
364
+ try:
365
+ from tqdm import tqdm
366
+ results = list(tqdm(
367
+ pool.imap_unordered(_upload_batch_worker, worker_args),
368
+ total=total_batches,
369
+ desc="Uploading batches",
370
+ unit="batch",
371
+ ))
372
+ except ImportError:
373
+ results = []
374
+ for args in worker_args:
375
+ try:
376
+ result = pool.apply_async(_upload_batch_worker, (args,))
377
+ uploaded, failed = result.get(timeout=worker_timeout)
378
+ results.append((uploaded, failed))
379
+ except Exception as e:
380
+ logger.error(f"Worker timeout or error: {e}")
381
+ results.append((0, len(args[1])))
382
+ else:
383
+ results = []
384
+ for args in worker_args:
385
+ try:
386
+ result = pool.apply_async(_upload_batch_worker, (args,))
387
+ uploaded, failed = result.get(timeout=worker_timeout)
388
+ results.append((uploaded, failed))
389
+ except Exception as e:
390
+ logger.error(f"Worker timeout or error: {e}")
391
+ results.append((0, len(args[1])))
392
+
393
+ # Aggregate results
394
+ for uploaded, failed in results:
395
+ total_uploaded += uploaded
396
+ total_failed += failed
397
+
398
+ return True, f"Uploaded {total_uploaded}, skipped {skipped}, failed {total_failed} files"
399
+
400
+
401
+ def upload_deploy_files(
402
+ space_repo: str,
403
+ overwrite: bool = False,
404
+ token: Optional[str] = None,
405
+ ) -> tuple[bool, str]:
406
+ """
407
+ Upload deploy files (Dockerfile, app.py, README.md) to Space repo.
408
+
409
+ Args:
410
+ space_repo: HF Space repo ID
411
+ overwrite: Overwrite existing files
412
+ token: HF token
413
+
414
+ Returns:
415
+ Tuple of (success, message)
416
+ """
417
+ from huggingface_hub import HfApi
418
+
419
+ from genarena.sync.hf_ops import upload_file
420
+
421
+ if token is None:
422
+ from genarena.sync.hf_ops import require_hf_token
423
+
424
+ token = require_hf_token()
425
+
426
+ api = HfApi(token=token)
427
+
428
+ # Get deploy directory
429
+ deploy_dir = os.path.dirname(os.path.abspath(__file__))
430
+ deploy_dir = os.path.join(os.path.dirname(deploy_dir), "deploy")
431
+
432
+ if not os.path.isdir(deploy_dir):
433
+ return False, f"Deploy directory not found: {deploy_dir}"
434
+
435
+ # Files to upload
436
+ deploy_files = [
437
+ ("Dockerfile", "Dockerfile"),
438
+ ("app.py", "genarena/deploy/app.py"),
439
+ ("README.md", "README.md"),
440
+ ]
441
+
442
+ # Get existing files
443
+ existing_files: set[str] = set()
444
+ if not overwrite:
445
+ try:
446
+ existing_files = set(
447
+ api.list_repo_files(repo_id=space_repo, repo_type="space")
448
+ )
449
+ except Exception:
450
+ pass
451
+
452
+ uploaded = 0
453
+ skipped = 0
454
+ failed = 0
455
+
456
+ for local_name, remote_path in deploy_files:
457
+ local_path = os.path.join(deploy_dir, local_name)
458
+ if not os.path.isfile(local_path):
459
+ logger.warning(f"Deploy file not found: {local_path}")
460
+ continue
461
+
462
+ if not overwrite and remote_path in existing_files:
463
+ skipped += 1
464
+ continue
465
+
466
+ success, msg = upload_file(
467
+ repo_id=space_repo,
468
+ local_path=local_path,
469
+ remote_path=remote_path,
470
+ token=token,
471
+ commit_message=f"Upload {remote_path}",
472
+ repo_type="space",
473
+ )
474
+ if success:
475
+ uploaded += 1
476
+ else:
477
+ failed += 1
478
+ logger.warning(f"Failed to upload {remote_path}: {msg}")
479
+
480
+ # Also upload the genarena package files needed for the Space
481
+ # We need to upload the entire genarena package
482
+ success, msg = upload_genarena_package(space_repo, token, overwrite)
483
+ if not success:
484
+ return False, f"Failed to upload genarena package: {msg}"
485
+
486
+ return True, f"Uploaded {uploaded}, skipped {skipped}, failed {failed} deploy files. {msg}"
487
+
488
+
489
+ def upload_genarena_package(
490
+ space_repo: str,
491
+ token: str,
492
+ overwrite: bool = False,
493
+ ) -> tuple[bool, str]:
494
+ """
495
+ Upload the genarena package to the Space repo.
496
+
497
+ Args:
498
+ space_repo: HF Space repo ID
499
+ token: HF token
500
+ overwrite: Overwrite existing files
501
+
502
+ Returns:
503
+ Tuple of (success, message)
504
+ """
505
+ from huggingface_hub import HfApi
506
+
507
+ api = HfApi(token=token)
508
+
509
+ # Get genarena package directory
510
+ genarena_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
511
+ project_root = os.path.dirname(genarena_dir)
512
+
513
+ try:
514
+ # Upload pyproject.toml
515
+ pyproject_path = os.path.join(project_root, "pyproject.toml")
516
+ if os.path.isfile(pyproject_path):
517
+ api.upload_file(
518
+ repo_id=space_repo,
519
+ path_or_fileobj=pyproject_path,
520
+ path_in_repo="pyproject.toml",
521
+ repo_type="space",
522
+ commit_message="Upload pyproject.toml",
523
+ )
524
+
525
+ # Upload genarena package using upload_folder
526
+ api.upload_folder(
527
+ repo_id=space_repo,
528
+ folder_path=genarena_dir,
529
+ path_in_repo="genarena",
530
+ repo_type="space",
531
+ commit_message="[genarena deploy] Upload genarena package",
532
+ allow_patterns=["**/*.py", "**/*.html", "**/*.css", "**/*.js"],
533
+ ignore_patterns=["**/__pycache__/**", "**/.pytest_cache/**"],
534
+ )
535
+
536
+ return True, "Package uploaded successfully"
537
+ except Exception as e:
538
+ logger.error(f"Failed to upload package: {e}")
539
+ return False, str(e)
genarena/sync/git_ops.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Git operations module for GenArena.
3
+
4
+ This module provides Git version control functionality for arena data,
5
+ including initialization, commit, remote configuration, and push operations.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import subprocess
11
+ from datetime import datetime
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Patterns to exclude from Git tracking (models directories contain large image files)
18
+ GITIGNORE_PATTERNS = [
19
+ "# GenArena: Exclude model output images (large files)",
20
+ "*/models/",
21
+ "",
22
+ "# Python cache",
23
+ "__pycache__/",
24
+ "*.pyc",
25
+ "",
26
+ "# OS files",
27
+ ".DS_Store",
28
+ "Thumbs.db",
29
+ ]
30
+
31
+
32
+ def _run_git_command(
33
+ arena_dir: str,
34
+ args: list,
35
+ check: bool = True,
36
+ capture_output: bool = True,
37
+ ) -> subprocess.CompletedProcess:
38
+ """
39
+ Run a git command in the arena directory.
40
+
41
+ Args:
42
+ arena_dir: Path to the arena directory
43
+ args: Git command arguments (without 'git' prefix)
44
+ check: If True, raise exception on non-zero exit code
45
+ capture_output: If True, capture stdout and stderr
46
+
47
+ Returns:
48
+ CompletedProcess instance
49
+
50
+ Raises:
51
+ subprocess.CalledProcessError: If check=True and command fails
52
+ """
53
+ cmd = ["git"] + args
54
+ return subprocess.run(
55
+ cmd,
56
+ cwd=arena_dir,
57
+ check=check,
58
+ capture_output=capture_output,
59
+ text=True,
60
+ )
61
+
62
+
63
+ def is_git_initialized(arena_dir: str) -> bool:
64
+ """
65
+ Check if the arena directory is a Git repository.
66
+
67
+ Args:
68
+ arena_dir: Path to the arena directory
69
+
70
+ Returns:
71
+ True if Git is initialized, False otherwise
72
+ """
73
+ git_dir = os.path.join(arena_dir, ".git")
74
+ return os.path.isdir(git_dir)
75
+
76
+
77
+ def git_init(arena_dir: str) -> tuple[bool, str]:
78
+ """
79
+ Initialize a Git repository in the arena directory.
80
+
81
+ Args:
82
+ arena_dir: Path to the arena directory
83
+
84
+ Returns:
85
+ Tuple of (success, message)
86
+ """
87
+ if is_git_initialized(arena_dir):
88
+ return True, "Git repository already initialized"
89
+
90
+ # Ensure directory exists
91
+ os.makedirs(arena_dir, exist_ok=True)
92
+
93
+ try:
94
+ result = _run_git_command(arena_dir, ["init"])
95
+ logger.info(f"Initialized Git repository in {arena_dir}")
96
+
97
+ # Ensure .gitignore is set up
98
+ ensure_gitignore(arena_dir)
99
+
100
+ return True, "Git repository initialized successfully"
101
+ except subprocess.CalledProcessError as e:
102
+ error_msg = f"Failed to initialize Git repository: {e.stderr}"
103
+ logger.error(error_msg)
104
+ return False, error_msg
105
+
106
+
107
+ def ensure_gitignore(arena_dir: str) -> tuple[bool, str]:
108
+ """
109
+ Create or update .gitignore file to exclude models directories.
110
+
111
+ Args:
112
+ arena_dir: Path to the arena directory
113
+
114
+ Returns:
115
+ Tuple of (success, message)
116
+ """
117
+ gitignore_path = os.path.join(arena_dir, ".gitignore")
118
+
119
+ existing_content = ""
120
+ if os.path.isfile(gitignore_path):
121
+ with open(gitignore_path, "r", encoding="utf-8") as f:
122
+ existing_content = f.read()
123
+
124
+ # Check if the key pattern already exists
125
+ key_pattern = "*/models/"
126
+ if key_pattern in existing_content:
127
+ return True, ".gitignore already contains required patterns"
128
+
129
+ # Append patterns to existing content
130
+ new_content = existing_content
131
+ if new_content and not new_content.endswith("\n"):
132
+ new_content += "\n"
133
+
134
+ if new_content:
135
+ new_content += "\n"
136
+
137
+ new_content += "\n".join(GITIGNORE_PATTERNS)
138
+
139
+ with open(gitignore_path, "w", encoding="utf-8") as f:
140
+ f.write(new_content)
141
+
142
+ logger.info(f"Updated .gitignore in {arena_dir}")
143
+ return True, ".gitignore updated successfully"
144
+
145
+
146
+ def git_add_all(arena_dir: str) -> tuple[bool, str]:
147
+ """
148
+ Stage all changes in the arena directory (respecting .gitignore).
149
+
150
+ Args:
151
+ arena_dir: Path to the arena directory
152
+
153
+ Returns:
154
+ Tuple of (success, message)
155
+ """
156
+ if not is_git_initialized(arena_dir):
157
+ return False, "Git repository not initialized"
158
+
159
+ try:
160
+ _run_git_command(arena_dir, ["add", "-A"])
161
+ return True, "All changes staged"
162
+ except subprocess.CalledProcessError as e:
163
+ error_msg = f"Failed to stage changes: {e.stderr}"
164
+ logger.error(error_msg)
165
+ return False, error_msg
166
+
167
+
168
+ def has_uncommitted_changes(arena_dir: str) -> bool:
169
+ """
170
+ Check if there are uncommitted changes in the repository.
171
+
172
+ Args:
173
+ arena_dir: Path to the arena directory
174
+
175
+ Returns:
176
+ True if there are uncommitted changes, False otherwise
177
+ """
178
+ if not is_git_initialized(arena_dir):
179
+ return False
180
+
181
+ try:
182
+ # Check for staged changes
183
+ result = _run_git_command(arena_dir, ["diff", "--cached", "--quiet"], check=False)
184
+ if result.returncode != 0:
185
+ return True
186
+
187
+ # Check for unstaged changes
188
+ result = _run_git_command(arena_dir, ["diff", "--quiet"], check=False)
189
+ if result.returncode != 0:
190
+ return True
191
+
192
+ # Check for untracked files (that aren't ignored)
193
+ result = _run_git_command(
194
+ arena_dir,
195
+ ["ls-files", "--others", "--exclude-standard"],
196
+ check=False
197
+ )
198
+ if result.stdout.strip():
199
+ return True
200
+
201
+ return False
202
+ except Exception as e:
203
+ logger.warning(f"Error checking for uncommitted changes: {e}")
204
+ return False
205
+
206
+
207
+ def git_commit(
208
+ arena_dir: str,
209
+ message: Optional[str] = None,
210
+ command_name: Optional[str] = None,
211
+ ) -> tuple[bool, str]:
212
+ """
213
+ Commit staged changes.
214
+
215
+ Args:
216
+ arena_dir: Path to the arena directory
217
+ message: Custom commit message (optional)
218
+ command_name: Name of the command that triggered this commit (for auto-commit)
219
+
220
+ Returns:
221
+ Tuple of (success, message)
222
+ """
223
+ if not is_git_initialized(arena_dir):
224
+ return False, "Git repository not initialized"
225
+
226
+ # Stage all changes first
227
+ success, msg = git_add_all(arena_dir)
228
+ if not success:
229
+ return False, msg
230
+
231
+ # Check if there's anything to commit
232
+ result = _run_git_command(arena_dir, ["diff", "--cached", "--quiet"], check=False)
233
+ if result.returncode == 0:
234
+ return True, "Nothing to commit, working tree clean"
235
+
236
+ # Generate commit message
237
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
238
+ if message:
239
+ commit_msg = message
240
+ elif command_name:
241
+ commit_msg = f"[genarena] Auto commit after {command_name} at {timestamp}"
242
+ else:
243
+ commit_msg = f"[genarena] Auto commit at {timestamp}"
244
+
245
+ try:
246
+ _run_git_command(arena_dir, ["commit", "-m", commit_msg])
247
+ logger.info(f"Committed changes: {commit_msg}")
248
+ return True, f"Committed: {commit_msg}"
249
+ except subprocess.CalledProcessError as e:
250
+ error_msg = f"Failed to commit: {e.stderr}"
251
+ logger.error(error_msg)
252
+ return False, error_msg
253
+
254
+
255
+ def git_remote_get_url(arena_dir: str, remote_name: str = "origin") -> Optional[str]:
256
+ """
257
+ Get the URL of a remote repository.
258
+
259
+ Args:
260
+ arena_dir: Path to the arena directory
261
+ remote_name: Name of the remote (default: origin)
262
+
263
+ Returns:
264
+ Remote URL or None if not configured
265
+ """
266
+ if not is_git_initialized(arena_dir):
267
+ return None
268
+
269
+ try:
270
+ result = _run_git_command(
271
+ arena_dir,
272
+ ["remote", "get-url", remote_name],
273
+ check=False
274
+ )
275
+ if result.returncode == 0:
276
+ return result.stdout.strip()
277
+ return None
278
+ except Exception:
279
+ return None
280
+
281
+
282
+ def git_remote_add(
283
+ arena_dir: str,
284
+ url: str,
285
+ remote_name: str = "origin",
286
+ force: bool = False,
287
+ ) -> tuple[bool, str]:
288
+ """
289
+ Configure a remote repository.
290
+
291
+ Args:
292
+ arena_dir: Path to the arena directory
293
+ url: Remote repository URL
294
+ remote_name: Name of the remote (default: origin)
295
+ force: If True, overwrite existing remote URL
296
+
297
+ Returns:
298
+ Tuple of (success, message)
299
+ """
300
+ if not is_git_initialized(arena_dir):
301
+ return False, "Git repository not initialized"
302
+
303
+ existing_url = git_remote_get_url(arena_dir, remote_name)
304
+
305
+ if existing_url:
306
+ if existing_url == url:
307
+ return True, f"Remote '{remote_name}' already configured with this URL"
308
+
309
+ if not force:
310
+ return False, (
311
+ f"Remote '{remote_name}' already exists with URL: {existing_url}. "
312
+ f"Use --force to overwrite."
313
+ )
314
+
315
+ # Remove existing remote
316
+ try:
317
+ _run_git_command(arena_dir, ["remote", "remove", remote_name])
318
+ except subprocess.CalledProcessError as e:
319
+ return False, f"Failed to remove existing remote: {e.stderr}"
320
+
321
+ # Add remote
322
+ try:
323
+ _run_git_command(arena_dir, ["remote", "add", remote_name, url])
324
+ logger.info(f"Added remote '{remote_name}': {url}")
325
+ return True, f"Remote '{remote_name}' configured: {url}"
326
+ except subprocess.CalledProcessError as e:
327
+ error_msg = f"Failed to add remote: {e.stderr}"
328
+ logger.error(error_msg)
329
+ return False, error_msg
330
+
331
+
332
+ def git_push(
333
+ arena_dir: str,
334
+ remote_name: str = "origin",
335
+ branch: Optional[str] = None,
336
+ set_upstream: bool = True,
337
+ ) -> tuple[bool, str]:
338
+ """
339
+ Push commits to the remote repository.
340
+
341
+ Args:
342
+ arena_dir: Path to the arena directory
343
+ remote_name: Name of the remote (default: origin)
344
+ branch: Branch name (default: current branch)
345
+ set_upstream: If True, set upstream tracking
346
+
347
+ Returns:
348
+ Tuple of (success, message)
349
+ """
350
+ if not is_git_initialized(arena_dir):
351
+ return False, "Git repository not initialized"
352
+
353
+ # Check if remote is configured
354
+ remote_url = git_remote_get_url(arena_dir, remote_name)
355
+ if not remote_url:
356
+ return False, f"Remote '{remote_name}' not configured. Use 'genarena git remote --url <url>' first."
357
+
358
+ # Get current branch if not specified
359
+ if not branch:
360
+ try:
361
+ result = _run_git_command(arena_dir, ["branch", "--show-current"])
362
+ branch = result.stdout.strip()
363
+ if not branch:
364
+ # Might be on a detached HEAD, try to get default branch
365
+ branch = "main"
366
+ except subprocess.CalledProcessError:
367
+ branch = "main"
368
+
369
+ # Push
370
+ try:
371
+ push_args = ["push"]
372
+ if set_upstream:
373
+ push_args.extend(["-u", remote_name, branch])
374
+ else:
375
+ push_args.extend([remote_name, branch])
376
+
377
+ _run_git_command(arena_dir, push_args)
378
+ logger.info(f"Pushed to {remote_name}/{branch}")
379
+ return True, f"Pushed to {remote_name}/{branch}"
380
+ except subprocess.CalledProcessError as e:
381
+ error_msg = f"Failed to push: {e.stderr}"
382
+ logger.error(error_msg)
383
+ return False, error_msg
384
+
385
+
386
+ def git_sync(arena_dir: str) -> tuple[bool, str]:
387
+ """
388
+ Commit all changes and push to remote (one-click sync).
389
+
390
+ Args:
391
+ arena_dir: Path to the arena directory
392
+
393
+ Returns:
394
+ Tuple of (success, message)
395
+ """
396
+ if not is_git_initialized(arena_dir):
397
+ return False, "Git repository not initialized"
398
+
399
+ messages = []
400
+
401
+ # Commit changes
402
+ success, msg = git_commit(arena_dir)
403
+ messages.append(msg)
404
+
405
+ if not success and "Nothing to commit" not in msg:
406
+ return False, msg
407
+
408
+ # Push to remote
409
+ success, msg = git_push(arena_dir)
410
+ messages.append(msg)
411
+
412
+ if not success:
413
+ # If push fails due to no remote, still return partial success
414
+ if "not configured" in msg:
415
+ return True, f"{messages[0]} (push skipped: {msg})"
416
+ return False, msg
417
+
418
+ return True, " | ".join(messages)
genarena/sync/hf_ops.py ADDED
@@ -0,0 +1,887 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Huggingface operations module for GenArena.
3
+
4
+ This module provides functionality for uploading and downloading
5
+ arena data to/from Huggingface Dataset repositories.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import time
11
+ import functools
12
+ from typing import Any, Callable, Optional, TypeVar
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Type variable for retry decorator
17
+ T = TypeVar("T")
18
+
19
+ # Default retry configuration
20
+ DEFAULT_MAX_RETRIES = 3
21
+ DEFAULT_RETRY_DELAY = 2.0
22
+ DEFAULT_RETRY_BACKOFF = 2.0 # Exponential backoff multiplier
23
+
24
+
25
+ def retry_on_failure(
26
+ max_retries: int = DEFAULT_MAX_RETRIES,
27
+ delay: float = DEFAULT_RETRY_DELAY,
28
+ backoff: float = DEFAULT_RETRY_BACKOFF,
29
+ exceptions: tuple = (Exception,),
30
+ ) -> Callable:
31
+ """
32
+ Decorator that retries a function on failure with exponential backoff.
33
+
34
+ Args:
35
+ max_retries: Maximum number of retry attempts
36
+ delay: Initial delay between retries in seconds
37
+ backoff: Multiplier for delay after each retry
38
+ exceptions: Tuple of exception types to catch and retry
39
+
40
+ Returns:
41
+ Decorated function
42
+ """
43
+ def decorator(func: Callable[..., T]) -> Callable[..., T]:
44
+ @functools.wraps(func)
45
+ def wrapper(*args: Any, **kwargs: Any) -> T:
46
+ current_delay = delay
47
+ last_exception = None
48
+
49
+ for attempt in range(max_retries + 1):
50
+ try:
51
+ return func(*args, **kwargs)
52
+ except exceptions as e:
53
+ last_exception = e
54
+ if attempt < max_retries:
55
+ logger.warning(
56
+ f"{func.__name__} failed (attempt {attempt + 1}/{max_retries + 1}): {e}. "
57
+ f"Retrying in {current_delay:.1f}s..."
58
+ )
59
+ time.sleep(current_delay)
60
+ current_delay *= backoff
61
+ else:
62
+ logger.error(
63
+ f"{func.__name__} failed after {max_retries + 1} attempts: {e}"
64
+ )
65
+
66
+ # Re-raise the last exception
67
+ raise last_exception # type: ignore
68
+
69
+ return wrapper
70
+ return decorator
71
+
72
+ # Environment variable for HF token
73
+ HF_TOKEN_ENV = "HF_TOKEN"
74
+
75
+
76
+ def get_hf_token() -> Optional[str]:
77
+ """
78
+ Get the Huggingface token from environment variable.
79
+
80
+ Returns:
81
+ Token string or None if not set
82
+ """
83
+ return os.environ.get(HF_TOKEN_ENV)
84
+
85
+
86
+ def require_hf_token() -> str:
87
+ """
88
+ Get the Huggingface token, raising an error if not set.
89
+
90
+ Returns:
91
+ Token string
92
+
93
+ Raises:
94
+ ValueError: If HF_TOKEN environment variable is not set
95
+ """
96
+ token = get_hf_token()
97
+ if not token:
98
+ raise ValueError(
99
+ f"Environment variable {HF_TOKEN_ENV} is not set. "
100
+ f"Please set it with your Huggingface token: "
101
+ f"export {HF_TOKEN_ENV}='your_token_here'"
102
+ )
103
+ return token
104
+
105
+
106
+ def validate_dataset_repo(repo_id: str, token: Optional[str] = None) -> tuple[bool, str]:
107
+ """
108
+ Validate that a repository exists and is a Dataset type.
109
+
110
+ Args:
111
+ repo_id: Repository ID (e.g., "username/repo-name")
112
+ token: Huggingface token (optional for public repos)
113
+
114
+ Returns:
115
+ Tuple of (is_valid, message)
116
+ """
117
+ try:
118
+ from huggingface_hub import HfApi
119
+ from huggingface_hub.utils import RepositoryNotFoundError
120
+
121
+ api = HfApi(token=token)
122
+
123
+ try:
124
+ repo_info = api.repo_info(repo_id=repo_id, repo_type="dataset")
125
+ return True, f"Valid Dataset repository: {repo_id}"
126
+ except RepositoryNotFoundError:
127
+ # Try to check if it exists as a different type
128
+ try:
129
+ # Check if it's a model repo
130
+ api.repo_info(repo_id=repo_id, repo_type="model")
131
+ return False, (
132
+ f"Repository '{repo_id}' exists but is a Model repository, not a Dataset. "
133
+ f"Please create a Dataset repository on Huggingface."
134
+ )
135
+ except RepositoryNotFoundError:
136
+ pass
137
+
138
+ try:
139
+ # Check if it's a space repo
140
+ api.repo_info(repo_id=repo_id, repo_type="space")
141
+ return False, (
142
+ f"Repository '{repo_id}' exists but is a Space repository, not a Dataset. "
143
+ f"Please create a Dataset repository on Huggingface."
144
+ )
145
+ except RepositoryNotFoundError:
146
+ pass
147
+
148
+ return False, (
149
+ f"Repository '{repo_id}' does not exist. "
150
+ f"Please create a Dataset repository on Huggingface first: "
151
+ f"https://huggingface.co/new-dataset"
152
+ )
153
+
154
+ except ImportError:
155
+ return False, (
156
+ "huggingface_hub package is not installed. "
157
+ "Please install it with: pip install huggingface_hub"
158
+ )
159
+ except Exception as e:
160
+ return False, f"Error validating repository: {e}"
161
+
162
+
163
def list_repo_files(
    repo_id: str,
    token: Optional[str] = None,
    revision: str = "main",
) -> tuple[bool, list[str], str]:
    """
    List all files in a Huggingface Dataset repository.

    Args:
        repo_id: Repository ID
        token: Huggingface token (optional for public repos)
        revision: Branch/revision name

    Returns:
        Tuple of (success, file_list, message)
    """
    try:
        from huggingface_hub import HfApi

        client = HfApi(token=token)
        names = client.list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
        )
        return True, list(names), f"Found {len(names)} files in {repo_id}"
    except Exception as e:
        # Any failure (missing package, auth, network, unknown repo) is
        # reported as a non-fatal (False, [], message) result.
        return False, [], f"Error listing repository files: {e}"
194
+
195
+
196
def get_repo_file_info(
    repo_id: str,
    token: Optional[str] = None,
    revision: str = "main",
) -> tuple[bool, list[dict], str]:
    """
    Get detailed file information from a Huggingface Dataset repository.

    Args:
        repo_id: Repository ID
        token: Huggingface token (optional for public repos)
        revision: Branch/revision name

    Returns:
        Tuple of (success, file_info_list, message) where each entry in
        file_info_list is a dict with "path", "size" and "blob_id" keys.
    """
    try:
        from huggingface_hub import HfApi

        client = HfApi(token=token)
        # files_metadata=True asks the Hub to populate per-file size/blob data.
        info = client.repo_info(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
            files_metadata=True,
        )

        details = [
            {
                "path": sibling.rfilename,
                "size": sibling.size,
                "blob_id": sibling.blob_id,
            }
            for sibling in (info.siblings or [])
        ]

        return True, details, f"Found {len(details)} files in {repo_id}"

    except Exception as e:
        return False, [], f"Error getting repository info: {e}"
237
+
238
+
239
def upload_file(
    repo_id: str,
    local_path: str,
    remote_path: str,
    token: str,
    commit_message: Optional[str] = None,
    max_retries: int = DEFAULT_MAX_RETRIES,
    repo_type: str = "dataset",
) -> tuple[bool, str]:
    """
    Upload a single file to a Huggingface repository with retry support.

    Args:
        repo_id: Repository ID
        local_path: Local file path
        remote_path: Path in the repository
        token: Huggingface token
        commit_message: Optional commit message
        max_retries: Maximum number of retry attempts on failure
        repo_type: Repository type ("dataset", "model", or "space")

    Returns:
        Tuple of (success, message)
    """
    from huggingface_hub import HfApi

    client = HfApi(token=token)
    message = commit_message if commit_message else f"Upload {remote_path}"

    # Wrap the actual upload call so the module-level retry decorator can
    # re-run it with exponential backoff on transient failures.
    @retry_on_failure(
        max_retries=max_retries,
        delay=DEFAULT_RETRY_DELAY,
        backoff=DEFAULT_RETRY_BACKOFF,
    )
    def _push() -> None:
        client.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=remote_path,
            repo_id=repo_id,
            repo_type=repo_type,
            commit_message=message,
        )

    try:
        _push()
    except Exception as e:
        return False, f"Error uploading file: {e}"
    return True, f"Uploaded {remote_path}"
289
+
290
+
291
def upload_files_batch(
    repo_id: str,
    file_mappings: list[tuple[str, str]],
    token: str,
    commit_message: Optional[str] = None,
) -> tuple[bool, str]:
    """
    Upload multiple files in a single commit.

    Args:
        repo_id: Repository ID
        file_mappings: List of (local_path, remote_path) tuples
        token: Huggingface token
        commit_message: Optional commit message

    Returns:
        Tuple of (success, message)
    """
    try:
        from huggingface_hub import HfApi, CommitOperationAdd

        client = HfApi(token=token)
        message = commit_message if commit_message else f"Upload {len(file_mappings)} files"

        # One CommitOperationAdd per file; a single create_commit call makes
        # the whole batch land atomically in one commit.
        operations = []
        for local_path, remote_path in file_mappings:
            operations.append(
                CommitOperationAdd(
                    path_in_repo=remote_path,
                    path_or_fileobj=local_path,
                )
            )

        client.create_commit(
            repo_id=repo_id,
            repo_type="dataset",
            operations=operations,
            commit_message=message,
        )

        return True, f"Uploaded {len(file_mappings)} files"

    except Exception as e:
        return False, f"Error uploading files: {e}"
336
+
337
+
338
def download_file(
    repo_id: str,
    remote_path: str,
    local_path: str,
    token: Optional[str] = None,
    revision: str = "main",
) -> tuple[bool, str]:
    """
    Download a single file from a Huggingface Dataset repository.

    Args:
        repo_id: Repository ID
        remote_path: Path in the repository
        local_path: Local file path to save to
        token: Huggingface token (optional for public repos)
        revision: Branch/revision name

    Returns:
        Tuple of (success, message)
    """
    try:
        from huggingface_hub import hf_hub_download

        # Ensure the local directory exists. os.path.dirname() returns ""
        # when local_path is a bare filename, and os.makedirs("") raises
        # FileNotFoundError — guard against that case (bug fix; the old code
        # also called makedirs twice on the same directory).
        local_dir = os.path.dirname(local_path)
        if local_dir:
            os.makedirs(local_dir, exist_ok=True)

        # hf_hub_download writes remote_path relative to local_dir; fall back
        # to "." for bare filenames so the hub library gets a valid directory.
        # NOTE(review): local_dir_use_symlinks is deprecated in recent
        # huggingface_hub releases (ignored with a warning) — confirm the
        # pinned version before removing it.
        downloaded_path = hf_hub_download(
            repo_id=repo_id,
            filename=remote_path,
            repo_type="dataset",
            revision=revision,
            token=token,
            local_dir=local_dir or ".",
            local_dir_use_symlinks=False,
        )

        # hf_hub_download may nest the file under remote_path's own
        # subdirectories; copy to the exact location the caller asked for.
        if downloaded_path != local_path:
            import shutil
            shutil.copy2(downloaded_path, local_path)

        return True, f"Downloaded {remote_path}"

    except Exception as e:
        return False, f"Error downloading file: {e}"
385
+
386
+
387
def check_file_exists(
    repo_id: str,
    remote_path: str,
    token: Optional[str] = None,
    revision: str = "main",
) -> bool:
    """
    Check if a file exists in the repository.

    Args:
        repo_id: Repository ID
        remote_path: Path in the repository
        token: Huggingface token (optional for public repos)
        revision: Branch/revision name

    Returns:
        True if file exists
    """
    try:
        from huggingface_hub import HfApi

        listing = HfApi(token=token).list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
        )
        return remote_path in listing
    except Exception:
        # Treat any lookup failure (missing package, network, unknown repo)
        # as "file not present".
        return False
419
+
420
+
421
def format_file_size(size_bytes: Optional[int]) -> str:
    """
    Format file size in human-readable format.

    Args:
        size_bytes: Size in bytes

    Returns:
        Human-readable size string, e.g. "1.5 KB"; "Unknown" for None.
    """
    if size_bytes is None:
        return "Unknown"

    # Walk up the unit ladder, dividing by 1024 until the magnitude fits.
    value = float(size_bytes)
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if abs(value) < 1024.0:
            return f"{value:.1f} {unit}"
        value /= 1024.0

    # Anything >= 1024 TB is reported in petabytes.
    return f"{value:.1f} PB"
440
+
441
+
442
+ # =============================================================================
443
+ # High-level operations
444
+ # =============================================================================
445
+
446
def upload_arena_data(
    arena_dir: str,
    repo_id: str,
    subsets: Optional[list[str]] = None,
    models: Optional[list[str]] = None,
    experiments: Optional[list[str]] = None,
    overwrite: bool = False,
    show_progress: bool = True,
    max_retries: int = DEFAULT_MAX_RETRIES,
) -> tuple[bool, str]:
    """
    Upload arena data to a Huggingface Dataset repository.

    This function:
    1. Validates the repository exists and is a Dataset type
    2. Collects files to upload based on filters
    3. Packs directories into ZIP files
    4. Uploads files with progress indication and retry on failure

    Supports resume upload: by default (overwrite=False), already uploaded files
    are automatically skipped, enabling resumable uploads after connection failures.

    Args:
        arena_dir: Path to the arena directory
        repo_id: Huggingface repository ID
        subsets: List of subsets to upload (None = all)
        models: List of models to upload (None = all)
        experiments: List of experiments (exp_name) to upload (None = all)
        overwrite: If True, overwrite existing files; if False, skip existing (resume mode)
        show_progress: If True, show progress bar
        max_retries: Maximum number of retry attempts per file on failure

    Returns:
        Tuple of (success, message). success is True when nothing failed OR
        at least one file uploaded (partial success counts as success).
    """
    # Deferred import to avoid a circular dependency between sync modules.
    from genarena.sync.packer import (
        collect_upload_tasks,
        pack_model_dir,
        pack_exp_dir,
        TempPackingContext,
        TaskType,
    )

    # Get token (uploads always require authentication)
    try:
        token = require_hf_token()
    except ValueError as e:
        return False, str(e)

    # Validate repository
    valid, msg = validate_dataset_repo(repo_id, token)
    if not valid:
        return False, msg

    logger.info(f"Uploading to repository: {repo_id}")

    # Collect upload tasks
    tasks = collect_upload_tasks(arena_dir, subsets, models, experiments)
    if not tasks:
        return False, "No files to upload. Check arena_dir and filters."

    logger.info(f"Found {len(tasks)} items to scan")

    # Get existing files in repo (for overwrite check). Best-effort: when the
    # listing fails, existing_files stays empty and everything is re-uploaded.
    existing_files = set()
    if not overwrite:
        logger.info("Checking existing files in remote repository...")
        success, files, _ = list_repo_files(repo_id, token)
        if success:
            existing_files = set(files)
            logger.info(f"Found {len(existing_files)} files in remote repository")

    # Pre-scan: categorize tasks into to_upload and to_skip
    to_upload = []
    to_skip = []
    for task in tasks:
        if not overwrite and task.remote_path in existing_files:
            to_skip.append(task)
        else:
            to_upload.append(task)

    # Display scan summary
    logger.info(f"Scan summary: {len(to_upload)} to upload, {len(to_skip)} already exist (will skip)")

    if to_skip:
        logger.info("Already uploaded (will skip):")
        for task in to_skip[:10]:
            logger.info(f"  ✓ {task.remote_path}")
        if len(to_skip) > 10:
            logger.info(f"  ... and {len(to_skip) - 10} more")

    if to_upload:
        logger.info("To be uploaded:")
        for task in to_upload[:10]:
            logger.info(f"  → {task.remote_path}")
        if len(to_upload) > 10:
            logger.info(f"  ... and {len(to_upload) - 10} more")

    if not to_upload:
        return True, f"All {len(to_skip)} files already exist in repository. Nothing to upload."

    # Process tasks (only those that need uploading)
    uploaded = 0
    skipped = len(to_skip)  # Pre-count skipped
    failed = 0
    errors = []

    # Setup progress bar (optional; tqdm wraps the list, iteration unchanged)
    if show_progress:
        try:
            from tqdm import tqdm
            to_upload = tqdm(to_upload, desc="Uploading", unit="file")
        except ImportError:
            pass

    # TempPackingContext owns the temporary ZIP files and cleans them up on exit.
    with TempPackingContext() as ctx:
        for task in to_upload:
            try:
                if task.task_type == TaskType.MODEL_ZIP:
                    # Pack model directory into a temp ZIP before uploading
                    zip_path = ctx.get_temp_zip_path(task.remote_path)
                    success, msg = pack_model_dir(task.local_path, zip_path)
                    if not success:
                        errors.append(f"{task.name}: {msg}")
                        failed += 1
                        continue

                    # Upload ZIP with retry
                    success, msg = upload_file(
                        repo_id, zip_path, task.remote_path, token,
                        commit_message=f"[genarena] Upload model: {task.subset}/{task.name}",
                        max_retries=max_retries,
                    )

                elif task.task_type == TaskType.EXP_ZIP:
                    # Pack experiment directory into a temp ZIP
                    zip_path = ctx.get_temp_zip_path(task.remote_path)
                    success, msg = pack_exp_dir(task.local_path, zip_path)
                    if not success:
                        errors.append(f"{task.name}: {msg}")
                        failed += 1
                        continue

                    # Upload ZIP with retry
                    success, msg = upload_file(
                        repo_id, zip_path, task.remote_path, token,
                        commit_message=f"[genarena] Upload experiment: {task.subset}/{task.name}",
                        max_retries=max_retries,
                    )

                elif task.task_type == TaskType.SMALL_FILE:
                    # Upload small file directly with retry (no packing)
                    success, msg = upload_file(
                        repo_id, task.local_path, task.remote_path, token,
                        commit_message=f"[genarena] Upload {task.name}",
                        max_retries=max_retries,
                    )

                else:
                    success = False
                    msg = f"Unknown task type: {task.task_type}"

                if success:
                    uploaded += 1
                    logger.debug(f"Uploaded: {task.remote_path}")
                else:
                    errors.append(f"{task.name}: {msg}")
                    failed += 1

            except Exception as e:
                # Per-task isolation: one bad task never aborts the batch.
                errors.append(f"{task.name}: {e}")
                failed += 1

    # Summary (only the first 5 errors are shown in detail)
    summary = f"Uploaded: {uploaded}, Skipped: {skipped}, Failed: {failed}"
    if errors:
        summary += f"\nErrors:\n" + "\n".join(f"  - {e}" for e in errors[:5])
        if len(errors) > 5:
            summary += f"\n  ... and {len(errors) - 5} more errors"

    repo_url = f"https://huggingface.co/datasets/{repo_id}"
    summary += f"\n\nRepository URL: {repo_url}"

    # Partial success (some uploads landed) still reports True.
    success = failed == 0 or uploaded > 0
    return success, summary
631
+
632
+
633
def pull_arena_data(
    arena_dir: str,
    repo_id: str,
    subsets: Optional[list[str]] = None,
    models: Optional[list[str]] = None,
    experiments: Optional[list[str]] = None,
    revision: str = "main",
    overwrite: bool = False,
    show_progress: bool = True,
) -> tuple[bool, str]:
    """
    Pull arena data from a Huggingface Dataset repository.

    This function:
    1. Validates the repository exists and is a Dataset type
    2. Lists files in the repository
    3. Filters based on subsets/models
    4. Downloads and unpacks ZIP files

    Args:
        arena_dir: Path to the local arena directory
        repo_id: Huggingface repository ID
        subsets: List of subsets to download (None = all)
        models: List of models to download (None = all)
        experiments: List of experiments (exp_name) to download (None = all)
        revision: Branch/revision to download from
        overwrite: If True, overwrite existing files
        show_progress: If True, show progress bar

    Returns:
        Tuple of (success, message). success is True when nothing failed OR
        at least one file downloaded (partial success counts as success).
    """
    import tempfile
    import shutil
    # Deferred import to avoid a circular dependency between sync modules.
    from genarena.sync.packer import (
        collect_download_tasks,
        unpack_zip,
        TaskType,
    )

    # Get token (optional for public repos)
    token = get_hf_token()

    # Validate repository
    valid, msg = validate_dataset_repo(repo_id, token)
    if not valid:
        return False, msg

    logger.info(f"Pulling from repository: {repo_id} (revision: {revision})")

    # List files in repository
    success, repo_files, msg = list_repo_files(repo_id, token, revision)
    if not success:
        return False, msg

    if not repo_files:
        return False, "Repository is empty"

    # Collect download tasks
    tasks = collect_download_tasks(repo_files, arena_dir, subsets, models, experiments)
    if not tasks:
        return False, "No matching files to download. Check filters."

    logger.info(f"Found {len(tasks)} items to download")

    # Process tasks
    downloaded = 0
    skipped = 0
    failed = 0
    errors = []

    # Setup progress bar (optional; tqdm wraps the list, iteration unchanged)
    if show_progress:
        try:
            from tqdm import tqdm
            tasks = tqdm(tasks, desc="Downloading", unit="file")
        except ImportError:
            pass

    # Create temp directory for downloads; ZIPs land here before unpacking.
    temp_dir = tempfile.mkdtemp(prefix="genarena_pull_")

    try:
        for task in tasks:
            try:
                if task.task_type in (TaskType.MODEL_ZIP, TaskType.EXP_ZIP):
                    # Download ZIP to temp location, then unpack into place.
                    # NOTE(review): ZIP tasks are not skipped when the target
                    # already exists — overwrite is handled inside unpack_zip.
                    temp_zip = os.path.join(temp_dir, os.path.basename(task.remote_path))
                    success, msg = download_file(
                        repo_id, task.remote_path, temp_zip, token, revision
                    )

                    if not success:
                        errors.append(f"{task.name}: {msg}")
                        failed += 1
                        continue

                    # Unpack ZIP
                    success, msg = unpack_zip(temp_zip, task.local_path, overwrite)
                    if not success:
                        errors.append(f"{task.name}: {msg}")
                        failed += 1
                        continue

                    downloaded += 1
                    logger.debug(f"Downloaded and unpacked: {task.remote_path}")

                elif task.task_type == TaskType.SMALL_FILE:
                    # Check if file exists and skip if not overwriting
                    if os.path.exists(task.local_path) and not overwrite:
                        logger.debug(f"Skipping existing: {task.local_path}")
                        skipped += 1
                        continue

                    # Download file directly
                    success, msg = download_file(
                        repo_id, task.remote_path, task.local_path, token, revision
                    )

                    if success:
                        downloaded += 1
                        logger.debug(f"Downloaded: {task.remote_path}")
                    else:
                        errors.append(f"{task.name}: {msg}")
                        failed += 1

            except Exception as e:
                # Per-task isolation: one bad task never aborts the batch.
                errors.append(f"{task.name}: {e}")
                failed += 1

    finally:
        # Cleanup temp directory regardless of success/failure.
        shutil.rmtree(temp_dir, ignore_errors=True)

    # Summary (only the first 5 errors are shown in detail)
    summary = f"Downloaded: {downloaded}, Skipped: {skipped}, Failed: {failed}"
    if errors:
        summary += f"\nErrors:\n" + "\n".join(f"  - {e}" for e in errors[:5])
        if len(errors) > 5:
            summary += f"\n  ... and {len(errors) - 5} more errors"

    # Partial success (some downloads landed) still reports True.
    success = failed == 0 or downloaded > 0
    return success, summary
776
+
777
+
778
def list_repo_contents(
    repo_id: str,
    revision: str = "main",
) -> tuple[bool, str]:
    """
    List contents of a Huggingface Dataset repository.

    Displays files organized by subset with size information. The top-level
    directory of each path is treated as the subset name; within a subset,
    files are grouped into Models ("/models/" in path), Experiments
    ("/pk_logs/" in path), and Other.

    Args:
        repo_id: Huggingface repository ID
        revision: Branch/revision name

    Returns:
        Tuple of (success, formatted_output)
    """
    # Get token (optional for public repos)
    token = get_hf_token()

    # Validate repository
    valid, msg = validate_dataset_repo(repo_id, token)
    if not valid:
        return False, msg

    # Get file info (path/size/blob_id dicts)
    success, file_infos, msg = get_repo_file_info(repo_id, token, revision)
    if not success:
        return False, msg

    if not file_infos:
        return True, f"Repository '{repo_id}' is empty"

    # Organize by subset: first path segment is the subset, bare top-level
    # files go to other_files.
    subsets: dict[str, list[dict]] = {}
    other_files: list[dict] = []

    for info in file_infos:
        path = info["path"]
        parts = path.split("/")

        if len(parts) >= 2:
            subset = parts[0]
            if subset not in subsets:
                subsets[subset] = []
            subsets[subset].append(info)
        else:
            other_files.append(info)

    # Format output
    lines = [
        f"Repository: {repo_id}",
        f"Revision: {revision}",
        f"Total files: {len(file_infos)}",
        "",
    ]

    # "or 0" guards against size being None in the metadata.
    total_size = sum(f.get("size", 0) or 0 for f in file_infos)
    lines.append(f"Total size: {format_file_size(total_size)}")
    lines.append("")

    for subset in sorted(subsets.keys()):
        files = subsets[subset]
        subset_size = sum(f.get("size", 0) or 0 for f in files)

        lines.append(f"=== {subset} ({len(files)} files, {format_file_size(subset_size)}) ===")

        # Organize by type within the subset
        models = []
        experiments = []
        other = []

        for f in files:
            path = f["path"]
            if "/models/" in path:
                models.append(f)
            elif "/pk_logs/" in path:
                experiments.append(f)
            else:
                other.append(f)

        if models:
            lines.append("  Models:")
            for f in sorted(models, key=lambda x: x["path"]):
                size = format_file_size(f.get("size"))
                name = os.path.basename(f["path"])
                lines.append(f"    - {name} ({size})")

        if experiments:
            lines.append("  Experiments:")
            for f in sorted(experiments, key=lambda x: x["path"]):
                size = format_file_size(f.get("size"))
                name = os.path.basename(f["path"])
                lines.append(f"    - {name} ({size})")

        if other:
            lines.append("  Other:")
            for f in sorted(other, key=lambda x: x["path"]):
                size = format_file_size(f.get("size"))
                # Strip the subset prefix; keep the rest of the relative path.
                name = f["path"].split("/", 1)[1] if "/" in f["path"] else f["path"]
                lines.append(f"    - {name} ({size})")

        lines.append("")

    if other_files:
        lines.append("=== Other files ===")
        for f in sorted(other_files, key=lambda x: x["path"]):
            size = format_file_size(f.get("size"))
            lines.append(f"  - {f['path']} ({size})")

    return True, "\n".join(lines)
genarena/sync/init_ops.py ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Initialization operations for GenArena.
3
+
4
+ This module provides functionality for one-click initialization of arena
5
+ directories, including downloading benchmark data and official arena data
6
+ from HuggingFace repositories.
7
+ """
8
+
9
+ import logging
10
+ import os
11
+ from typing import Optional
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Default repository configurations
16
+ DEFAULT_BENCHMARK_REPO = "rhli/genarena"
17
+ DEFAULT_ARENA_REPO = "rhli/genarena-battlefield"
18
+
19
+
20
+ def _format_size(size_bytes: int) -> str:
21
+ """Format file size in human-readable format."""
22
+ if size_bytes < 1024:
23
+ return f"{size_bytes} B"
24
+ elif size_bytes < 1024 * 1024:
25
+ return f"{size_bytes / 1024:.1f} KB"
26
+ elif size_bytes < 1024 * 1024 * 1024:
27
+ return f"{size_bytes / 1024 / 1024:.1f} MB"
28
+ else:
29
+ return f"{size_bytes / 1024 / 1024 / 1024:.2f} GB"
30
+
31
+
32
def discover_repo_subsets(
    repo_id: str,
    token: Optional[str] = None,
    revision: str = "main",
) -> list[str]:
    """
    Discover available subsets in a HuggingFace repository.

    A subset is any top-level directory that contains parquet files,
    excluding hidden directories and the reserved names "data"/"raw".

    Args:
        repo_id: HuggingFace repository ID
        token: HuggingFace token (optional for public repos)
        revision: Repository revision/branch

    Returns:
        Sorted list of subset names found in the repository; empty on error.
    """
    from huggingface_hub import HfApi

    client = HfApi(token=token)

    try:
        files = client.list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
        )
    except Exception as e:
        # Best-effort: a listing failure just means "no subsets found".
        logger.warning(f"Failed to list repo files: {e}")
        return []

    found: set[str] = set()
    for path in files:
        # Only parquet files like <subset>/...parquet mark a subset.
        if not path.endswith(".parquet"):
            continue
        segments = path.split("/")
        if len(segments) < 2:
            continue
        top = segments[0]
        # Skip hidden directories and common non-subset directories
        if top.startswith(".") or top in ("data", "raw"):
            continue
        found.add(top)

    return sorted(found)
78
+
79
+
80
def download_benchmark_data(
    data_dir: str,
    repo_id: str = DEFAULT_BENCHMARK_REPO,
    subsets: Optional[list[str]] = None,
    revision: str = "main",
    overwrite: bool = False,
    show_progress: bool = True,
) -> tuple[bool, str, dict]:
    """
    Download benchmark Parquet data from HuggingFace.

    Expected repository structure:
        <subset>/data-00000-of-00001.parquet
        <subset>/data-00001-of-00001.parquet
        ...

    Downloads to:
        data_dir/<subset>/data-*.parquet

    Args:
        data_dir: Local directory to save data
        repo_id: HuggingFace repository ID
        subsets: List of subsets to download (None = all available)
        revision: Repository revision/branch
        overwrite: If True, overwrite existing files
        show_progress: If True, show progress information

    Returns:
        Tuple of (success, message, stats_dict). stats_dict tracks
        downloaded/skipped/failed file counts, total bytes, and per-subset
        breakdowns. success is True when nothing failed OR at least one
        file downloaded (partial success counts as success).
    """
    from huggingface_hub import HfApi, hf_hub_download

    # Deferred import to avoid a circular dependency between sync modules.
    from genarena.sync.hf_ops import get_hf_token

    token = get_hf_token()
    api = HfApi(token=token)

    # Mutated in place throughout and returned even on early failure.
    stats = {
        "downloaded_files": 0,
        "skipped_files": 0,
        "failed_files": 0,
        "total_bytes": 0,
        "subsets": {},
    }

    # Discover available subsets if not specified
    if subsets is None:
        logger.info(f"Discovering subsets in {repo_id}...")
        subsets = discover_repo_subsets(repo_id, token, revision)
        if not subsets:
            return False, f"No subsets found in repository {repo_id}", stats
        logger.info(f"Found subsets: {', '.join(subsets)}")

    # List all files in the repo
    try:
        all_files = list(api.list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
        ))
    except Exception as e:
        return False, f"Failed to list repository files: {e}", stats

    # Filter files for requested subsets
    files_to_download: list[tuple[str, str]] = []  # (remote_path, local_path)

    for subset in subsets:
        subset_files = [
            f for f in all_files
            if f.startswith(f"{subset}/") and f.endswith(".parquet")
        ]

        if not subset_files:
            logger.warning(f"No parquet files found for subset '{subset}'")
            continue

        stats["subsets"][subset] = {
            "files": len(subset_files),
            "bytes": 0,
            "downloaded": 0,
            "skipped": 0,
        }

        for remote_path in subset_files:
            # Construct local path: data_dir/<subset>/filename.parquet
            local_path = os.path.join(data_dir, remote_path)
            files_to_download.append((remote_path, local_path))

    if not files_to_download:
        return False, "No parquet files found for the specified subsets", stats

    # Create data directory
    os.makedirs(data_dir, exist_ok=True)

    # Download files
    errors: list[str] = []

    # Progress bar is optional; tqdm just wraps the list.
    if show_progress:
        try:
            from tqdm import tqdm
            files_iter = tqdm(files_to_download, desc="Downloading", unit="file")
        except ImportError:
            files_iter = files_to_download
    else:
        files_iter = files_to_download

    for remote_path, local_path in files_iter:
        # First path segment is the subset (by construction above).
        subset = remote_path.split("/")[0]

        # Check if file exists (resume mode: skip already-downloaded files)
        if os.path.exists(local_path) and not overwrite:
            logger.debug(f"Skipping existing file: {local_path}")
            stats["skipped_files"] += 1
            if subset in stats["subsets"]:
                stats["subsets"][subset]["skipped"] += 1
            continue

        # Create directory
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        try:
            # Download file. NOTE(review): local_dir_use_symlinks is
            # deprecated in recent huggingface_hub releases — confirm the
            # pinned version before removing it.
            downloaded_path = hf_hub_download(
                repo_id=repo_id,
                filename=remote_path,
                repo_type="dataset",
                revision=revision,
                token=token,
                local_dir=data_dir,
                local_dir_use_symlinks=False,
            )

            # Get file size for the stats/summary
            file_size = os.path.getsize(downloaded_path)
            stats["downloaded_files"] += 1
            stats["total_bytes"] += file_size

            if subset in stats["subsets"]:
                stats["subsets"][subset]["downloaded"] += 1
                stats["subsets"][subset]["bytes"] += file_size

            logger.debug(f"Downloaded: {remote_path} ({_format_size(file_size)})")

        except Exception as e:
            # Per-file isolation: one failure never aborts the batch.
            logger.error(f"Failed to download {remote_path}: {e}")
            errors.append(f"{remote_path}: {e}")
            stats["failed_files"] += 1

    # Build summary message (only the first 5 errors are shown in detail)
    lines = [
        f"Benchmark data download complete:",
        f"  Downloaded: {stats['downloaded_files']} files ({_format_size(stats['total_bytes'])})",
        f"  Skipped: {stats['skipped_files']} files (already exist)",
        f"  Failed: {stats['failed_files']} files",
    ]

    if stats["subsets"]:
        lines.append("  Subsets:")
        for subset, info in stats["subsets"].items():
            lines.append(
                f"    - {subset}: {info['downloaded']} downloaded, "
                f"{info['skipped']} skipped ({_format_size(info['bytes'])})"
            )

    if errors:
        lines.append("  Errors:")
        for err in errors[:5]:
            lines.append(f"    - {err}")
        if len(errors) > 5:
            lines.append(f"    ... and {len(errors) - 5} more errors")

    # Partial success (some downloads landed) still reports True.
    success = stats["failed_files"] == 0 or stats["downloaded_files"] > 0
    return success, "\n".join(lines), stats
253
+
254
+
255
def init_arena(
    arena_dir: str = "./arena",
    data_dir: str = "./data",
    subsets: Optional[list[str]] = None,
    benchmark_repo: str = DEFAULT_BENCHMARK_REPO,
    arena_repo: str = DEFAULT_ARENA_REPO,
    revision: str = "main",
    overwrite: bool = False,
    init_git: bool = False,
    data_only: bool = False,
    arena_only: bool = False,
    show_progress: bool = True,
) -> tuple[bool, str]:
    """
    One-click arena initialization.

    This function:
    1. Downloads benchmark Parquet data from HuggingFace (unless arena_only)
    2. Downloads arena data (model outputs + logs) from HuggingFace (unless data_only)
    3. Initializes Git repository in arena_dir (if init_git)

    Progress is printed to stdout step by step; a textual summary is returned.

    Args:
        arena_dir: Path to arena directory
        data_dir: Path to benchmark data directory
        subsets: List of subsets to download (None = all available)
        benchmark_repo: HuggingFace repo for benchmark data
        arena_repo: HuggingFace repo for arena data
        revision: HuggingFace revision/branch
        overwrite: If True, overwrite existing files
        init_git: If True, initialize Git repository in arena_dir
        data_only: If True, only download benchmark data
        arena_only: If True, only download arena data
        show_progress: If True, show progress information

    Returns:
        Tuple of (success, summary_message)
    """
    # Deferred imports to avoid circular dependencies between sync modules.
    from genarena.sync.hf_ops import pull_arena_data, get_hf_token
    from genarena.sync.git_ops import git_init, is_git_initialized

    lines: list[str] = []
    all_success = True
    benchmark_stats: dict = {}
    # NOTE(review): arena_stats is never assigned after this — currently unused.
    arena_stats: dict = {}

    # Resolve absolute paths
    arena_dir = os.path.abspath(arena_dir)
    data_dir = os.path.abspath(data_dir)

    # Step 1: Download benchmark data
    if not arena_only:
        step_num = 1
        # NOTE(review): when data_only=True and init_git=True this prints
        # "[Step 1/2]" although step 3 is skipped below (init_git requires
        # not data_only) — the displayed total can overcount by one. Confirm
        # intended CLI behavior before changing.
        total_steps = 2 if not data_only else 1
        if init_git:
            total_steps += 1

        print(f"[Step {step_num}/{total_steps}] Downloading benchmark data from {benchmark_repo}...")
        print(f"  Target directory: {data_dir}")
        if subsets:
            print(f"  Subsets: {', '.join(subsets)}")
        print()

        success, msg, benchmark_stats = download_benchmark_data(
            data_dir=data_dir,
            repo_id=benchmark_repo,
            subsets=subsets,
            revision=revision,
            overwrite=overwrite,
            show_progress=show_progress,
        )

        # chr(10) is "\n": re-indent the multi-line message by two spaces.
        print(f"  {msg.replace(chr(10), chr(10) + '  ')}")
        print()

        if not success:
            all_success = False
            lines.append(f"Benchmark data download failed")
        else:
            lines.append(
                f"Benchmark data: {benchmark_stats.get('downloaded_files', 0)} files "
                f"({_format_size(benchmark_stats.get('total_bytes', 0))})"
            )

    # Step 2: Download arena data
    if not data_only:
        step_num = 1 if arena_only else 2
        total_steps = 1 if arena_only else 2
        if init_git:
            total_steps += 1

        print(f"[Step {step_num}/{total_steps}] Downloading arena data from {arena_repo}...")
        print(f"  Target directory: {arena_dir}")
        if subsets:
            print(f"  Subsets: {', '.join(subsets)}")
        print()

        # Create arena directory
        os.makedirs(arena_dir, exist_ok=True)

        success, msg = pull_arena_data(
            arena_dir=arena_dir,
            repo_id=arena_repo,
            subsets=subsets,
            revision=revision,
            overwrite=overwrite,
            show_progress=show_progress,
        )

        print(f"  {msg.replace(chr(10), chr(10) + '  ')}")
        print()

        if not success:
            all_success = False
            lines.append(f"Arena data download failed: {msg}")
        else:
            lines.append(f"Arena data: downloaded to {arena_dir}")

    # Step 3: Initialize Git
    # Relies on total_steps computed by step 2's branch (step 3 only runs
    # when "not data_only", so step 2's branch always executed before this).
    if init_git and not data_only:
        step_num = total_steps
        print(f"[Step {step_num}/{total_steps}] Initializing Git repository...")

        if is_git_initialized(arena_dir):
            print(f"  Git repository already initialized at {arena_dir}")
            lines.append("Git: already initialized")
        else:
            success, msg = git_init(arena_dir)
            print(f"  {msg}")
            if success:
                lines.append("Git: initialized")
            else:
                lines.append(f"Git: initialization failed - {msg}")
        print()

    # Build final summary
    summary_lines = [
        "=== Summary ===",
    ]

    if not arena_only:
        summary_lines.append(f"Data directory: {data_dir}")
    if not data_only:
        summary_lines.append(f"Arena directory: {arena_dir}")

    if subsets:
        summary_lines.append(f"Subsets: {', '.join(subsets)}")
    elif benchmark_stats.get("subsets"):
        # Fall back to the subsets actually discovered during download.
        summary_lines.append(f"Subsets: {', '.join(benchmark_stats['subsets'].keys())}")

    for line in lines:
        summary_lines.append(f"  {line}")

    # Add next steps (copy-pastable CLI commands)
    summary_lines.append("")
    summary_lines.append("Next steps:")

    if not data_only:
        summary_lines.append(f"  # View current status")
        summary_lines.append(f"  genarena status --arena_dir {arena_dir} --data_dir {data_dir}")
        summary_lines.append("")
        summary_lines.append(f"  # Run evaluation battles")
        example_subset = subsets[0] if subsets else "basic"
        summary_lines.append(
            f"  genarena run --arena_dir {arena_dir} --data_dir {data_dir} --subset {example_subset}"
        )
        summary_lines.append("")
        summary_lines.append(f"  # View leaderboard")
        summary_lines.append(f"  genarena leaderboard --arena_dir {arena_dir} --subset {example_subset}")
    else:
        summary_lines.append(f"  # Initialize arena directory")
        summary_lines.append(f"  genarena init --arena_dir <path> --arena-only")

    return all_success, "\n".join(summary_lines)
genarena/sync/packer.py ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ZIP packing utilities for GenArena.
3
+
4
+ This module provides functionality for packing and unpacking arena data
5
+ for Huggingface upload/download operations.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import shutil
11
+ import tempfile
12
+ import zipfile
13
+ from dataclasses import dataclass
14
+ from enum import Enum
15
+ from pathlib import Path
16
+ from typing import Optional
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # Supported image file extensions for model directories
21
+ IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".svg"}
22
+
23
+
24
class TaskType(Enum):
    """Type of upload/download task.

    Used by PackTask/UnpackTask to tell the sync layer how to handle the
    local path: pack/unpack a directory as a ZIP, or copy a file as-is.
    """

    MODEL_ZIP = "model_zip"  # ZIP file for experiment-scoped model images
    EXP_ZIP = "exp_zip"  # ZIP file for experiment logs
    SMALL_FILE = "small_file"  # Small file (state.json, README.md)
29
+
30
+
31
@dataclass
class PackTask:
    """Represents a file packing/upload task.

    Produced by collect_upload_tasks(); consumed by the uploader which packs
    local_path (for ZIP task types) and pushes the result to remote_path.
    """

    task_type: TaskType  # How to treat local_path (see TaskType)
    local_path: str  # Local path (directory for ZIP, file for small files)
    remote_path: str  # Remote path in the HF repo
    subset: str  # Subset name
    name: str  # Model name or experiment name or file name
39
+
40
+
41
@dataclass
class UnpackTask:
    """Represents a file unpacking/download task.

    Produced by collect_download_tasks(); consumed by the downloader which
    fetches remote_path and extracts/copies it into local_path.
    """

    task_type: TaskType  # How to treat the downloaded artifact (see TaskType)
    remote_path: str  # Remote path in the HF repo
    local_path: str  # Local target path (extraction dir for ZIPs, file path otherwise)
    subset: str  # Subset name
    name: str  # Model name or experiment name or file name
49
+
50
+
51
def pack_directory(
    source_dir: str,
    output_zip: str,
    file_extensions: Optional[set] = None,
    max_depth: Optional[int] = None,
) -> tuple[bool, str]:
    """
    Pack a directory into a ZIP file.

    The directory name is preserved as the root folder inside the ZIP.
    Symbolic links are followed and the actual file contents are packed.

    Args:
        source_dir: Path to the directory to pack
        output_zip: Path to the output ZIP file
        file_extensions: Optional set of file extensions to include (e.g., {".png", ".jpg"}).
            If None, all files are included. Extensions should be lowercase with dot.
        max_depth: Optional maximum directory depth to traverse. None means unlimited.
            0 = only files directly in source_dir
            1 = files in source_dir and its immediate subdirectories
            etc.

    Returns:
        Tuple of (success, message)
    """
    if not os.path.isdir(source_dir):
        return False, f"Source directory does not exist: {source_dir}"

    # Resolve symlink if source_dir itself is a symlink
    resolved_source = os.path.realpath(source_dir)
    if not os.path.isdir(resolved_source):
        return False, f"Source directory symlink target does not exist: {resolved_source}"

    # Get the directory name to use as root in ZIP (use original name, not resolved)
    dir_name = os.path.basename(source_dir.rstrip(os.sep))

    try:
        # Ensure the output directory exists. output_zip may be a bare
        # filename, in which case dirname is "" and os.makedirs("") would
        # raise FileNotFoundError -- so only create when there is a parent.
        out_parent = os.path.dirname(output_zip)
        if out_parent:
            os.makedirs(out_parent, exist_ok=True)

        file_count = 0
        with zipfile.ZipFile(output_zip, "w", zipfile.ZIP_DEFLATED) as zf:
            # followlinks=True to traverse symlinked directories
            for root, dirs, files in os.walk(resolved_source, followlinks=True):
                if max_depth is not None:
                    # Depth of `root` relative to the source directory
                    rel_root = os.path.relpath(root, resolved_source)
                    current_depth = 0 if rel_root == "." else len(rel_root.split(os.sep))

                    # Skip directories beyond max_depth (defensive; normally
                    # pruning below prevents walking past the limit)
                    if current_depth > max_depth:
                        dirs[:] = []  # Prevent further recursion
                        continue

                    # Stop recursion at max_depth
                    if current_depth == max_depth:
                        dirs[:] = []

                for file in files:
                    # Filter by extension if specified
                    if file_extensions is not None:
                        ext = os.path.splitext(file)[1].lower()
                        if ext not in file_extensions:
                            continue

                    file_path = os.path.join(root, file)

                    # Skip broken symlinks
                    if os.path.islink(file_path) and not os.path.exists(file_path):
                        logger.warning(f"Skipping broken symlink: {file_path}")
                        continue

                    # Archive name: original dir_name as root. The ZIP format
                    # uses forward slashes, so normalize os-specific separators.
                    rel_to_resolved = os.path.relpath(file_path, resolved_source)
                    archive_name = os.path.join(dir_name, rel_to_resolved).replace(os.sep, "/")
                    zf.write(file_path, archive_name)
                    file_count += 1

        if file_count == 0:
            # Remove empty ZIP file
            os.remove(output_zip)
            return False, f"No files to pack in {source_dir}"

        return True, f"Packed {source_dir} -> {output_zip} ({file_count} files)"
    except Exception as e:
        return False, f"Failed to pack directory: {e}"
140
+
141
+
142
def pack_model_dir(model_dir: str, output_zip: str) -> tuple[bool, str]:
    """
    Pack a single model directory of images into a ZIP file.

    Only image files (png, jpg, jpeg, gif, webp, bmp, tiff, svg) are
    included, and only those sitting directly in the model directory --
    nested subdirectories such as fail/ are deliberately left out.

    Args:
        model_dir: Path to the model directory (e.g., arena_dir/basic/models/exp_001/model_a/)
        output_zip: Path to the output ZIP file

    Returns:
        Tuple of (success, message)
    """
    return pack_directory(
        model_dir,
        output_zip,
        file_extensions=IMAGE_EXTENSIONS,
        max_depth=0,
    )
158
+
159
+
160
def pack_exp_dir(exp_dir: str, output_zip: str) -> tuple[bool, str]:
    """
    Pack an experiment directory of battle logs into a ZIP file.

    All files at any depth are included (no extension or depth filter).

    Args:
        exp_dir: Path to the experiment directory (e.g., arena_dir/basic/pk_logs/exp_001/)
        output_zip: Path to the output ZIP file

    Returns:
        Tuple of (success, message)
    """
    return pack_directory(exp_dir, output_zip)
172
+
173
+
174
def unpack_zip(
    zip_path: str,
    target_dir: str,
    overwrite: bool = False,
) -> tuple[bool, str]:
    """
    Extract a ZIP archive into a target directory.

    The target directory is created if needed. When overwrite is False,
    members whose destination already exists are left untouched.

    Args:
        zip_path: Path to the ZIP file
        target_dir: Target directory to extract to
        overwrite: If True, overwrite existing files

    Returns:
        Tuple of (success, message)
    """
    if not os.path.isfile(zip_path):
        return False, f"ZIP file does not exist: {zip_path}"

    try:
        os.makedirs(target_dir, exist_ok=True)

        with zipfile.ZipFile(zip_path, "r") as archive:
            for entry in archive.namelist():
                destination = os.path.join(target_dir, entry)

                # Preserve existing files unless overwriting was requested
                if not overwrite and os.path.exists(destination):
                    logger.debug(f"Skipping existing file: {destination}")
                    continue

                archive.extract(entry, target_dir)

        return True, f"Unpacked {zip_path} -> {target_dir}"
    except Exception as e:
        return False, f"Failed to unpack ZIP: {e}"
211
+
212
+
213
def discover_subsets(arena_dir: str) -> list[str]:
    """
    Discover all subset directories in the arena directory.

    A directory counts as a subset when it contains at least one of the
    marker subdirectories: models/, pk_logs/, arena/.

    Args:
        arena_dir: Path to the arena directory

    Returns:
        Sorted list of subset names (empty if arena_dir does not exist)
    """
    if not os.path.isdir(arena_dir):
        return []

    markers = ("models", "pk_logs", "arena")
    found = []
    for entry in os.listdir(arena_dir):
        candidate = os.path.join(arena_dir, entry)
        if not os.path.isdir(candidate):
            continue
        if any(os.path.isdir(os.path.join(candidate, marker)) for marker in markers):
            found.append(entry)

    return sorted(found)
244
+
245
+
246
def discover_models(arena_dir: str, subset: str) -> list[str]:
    """
    Discover all model names in a subset (v2 layout).

    Args:
        arena_dir: Path to the arena directory
        subset: Subset name

    Returns:
        List of model names (globally unique across experiments); empty when
        the models root is missing or cannot be scanned.
    """
    from genarena.models import GlobalModelOutputManager

    models_root = os.path.join(arena_dir, subset, "models")
    if not os.path.isdir(models_root):
        return []

    try:
        return GlobalModelOutputManager(models_root).models
    except Exception:
        # For packer utilities, be conservative: return empty on scan failure.
        return []
268
+
269
+
270
def discover_model_experiments(arena_dir: str, subset: str) -> list[str]:
    """
    Discover experiment directories under a subset's models (v2 layout).

    In v2, model outputs live under:
        models/<exp_name>/<model_name>/...
    Returns the exp_name directories that contain at least one model with
    images, or an empty list when the root is missing or unscannable.
    """
    from genarena.models import GlobalModelOutputManager

    models_root = os.path.join(arena_dir, subset, "models")
    if not os.path.isdir(models_root):
        return []

    try:
        return GlobalModelOutputManager(models_root).experiments
    except Exception:
        return []
288
+
289
+
290
def discover_experiments(arena_dir: str, subset: str) -> list[str]:
    """
    Discover all experiment directories in a subset's pk_logs.

    Hidden entries (dot-prefixed, including .pk_logs_rm for deleted/orphaned
    logs) are excluded.

    Args:
        arena_dir: Path to the arena directory
        subset: Subset name

    Returns:
        Sorted list of experiment names
    """
    pk_logs_dir = os.path.join(arena_dir, subset, "pk_logs")
    if not os.path.isdir(pk_logs_dir):
        return []

    return sorted(
        entry
        for entry in os.listdir(pk_logs_dir)
        if not entry.startswith(".")
        and os.path.isdir(os.path.join(pk_logs_dir, entry))
    )
319
+
320
+
321
def collect_upload_tasks(
    arena_dir: str,
    subsets: Optional[list[str]] = None,
    models: Optional[list[str]] = None,
    experiments: Optional[list[str]] = None,
) -> list[PackTask]:
    """
    Collect all files/directories that need to be uploaded.

    Args:
        arena_dir: Path to the arena directory
        subsets: List of subsets to include (None = all)
        models: List of models to include (None = all)
        experiments: List of experiments to include (None = all)

    Returns:
        List of PackTask objects
    """
    tasks = []

    # Discover subsets if not specified
    all_subsets = discover_subsets(arena_dir)
    target_subsets = subsets if subsets else all_subsets

    for subset in target_subsets:
        if subset not in all_subsets:
            logger.warning(f"Subset '{subset}' not found in arena directory")
            continue

        subset_path = os.path.join(arena_dir, subset)
        models_root = os.path.join(subset_path, "models")

        # Collect model directories (v2 layout: models/<exp_name>/<model_name>/):
        # Each model is packed as a separate ZIP file.
        # - Default: upload all models
        # - If experiments filter is provided: only models under those exp_name
        # - If models filter is provided: only those specific models
        all_model_exps = discover_model_experiments(arena_dir, subset)

        target_model_exps: list[str]
        if experiments:
            target_model_exps = [e for e in experiments if e in all_model_exps]
        else:
            target_model_exps = all_model_exps

        # Collect individual model directories
        for exp in target_model_exps:
            exp_model_path = os.path.join(models_root, exp)
            if not os.path.isdir(exp_model_path):
                continue

            # List all model directories under this experiment
            for model_name in os.listdir(exp_model_path):
                model_path = os.path.join(exp_model_path, model_name)
                if not os.path.isdir(model_path):
                    continue

                # Apply models filter if specified
                if models and model_name not in models:
                    continue

                remote_path = f"{subset}/models/{exp}/{model_name}.zip"
                tasks.append(PackTask(
                    task_type=TaskType.MODEL_ZIP,
                    local_path=model_path,
                    remote_path=remote_path,
                    subset=subset,
                    name=f"{exp}/{model_name}",
                ))

        # Collect experiment directories.
        # Note: pk_logs are always uploaded regardless of model filter.
        pk_experiments = discover_experiments(arena_dir, subset)
        if experiments:
            pk_experiments = [e for e in pk_experiments if e in set(experiments)]
        for exp in pk_experiments:
            exp_path = os.path.join(subset_path, "pk_logs", exp)
            remote_path = f"{subset}/pk_logs/{exp}.zip"

            tasks.append(PackTask(
                task_type=TaskType.EXP_ZIP,
                local_path=exp_path,
                remote_path=remote_path,
                subset=subset,
                name=exp,
            ))

        # Collect small files: per-subset state.json
        state_path = os.path.join(subset_path, "arena", "state.json")
        if os.path.isfile(state_path):
            tasks.append(PackTask(
                task_type=TaskType.SMALL_FILE,
                local_path=state_path,
                remote_path=f"{subset}/arena/state.json",
                subset=subset,
                name="state.json",
            ))

        # Per-subset README.md
        readme_path = os.path.join(subset_path, "README.md")
        if os.path.isfile(readme_path):
            tasks.append(PackTask(
                task_type=TaskType.SMALL_FILE,
                local_path=readme_path,
                remote_path=f"{subset}/README.md",
                subset=subset,
                name="README.md",
            ))

    return tasks
432
+
433
+
434
def collect_download_tasks(
    repo_files: list[str],
    arena_dir: str,
    subsets: Optional[list[str]] = None,
    models: Optional[list[str]] = None,
    experiments: Optional[list[str]] = None,
) -> list[UnpackTask]:
    """
    Collect files to download based on repo contents and filters.

    Each remote path is classified by its shape (model ZIP in new or legacy
    layout, experiment-log ZIP, state.json, README.md); paths matching none
    of the known shapes are silently ignored.

    Args:
        repo_files: List of file paths in the HF repo
        arena_dir: Local arena directory path
        subsets: List of subsets to download (None = all)
        models: List of models to download (None = all)
        experiments: List of experiments to download (None = all)

    Returns:
        List of UnpackTask objects
    """
    tasks = []

    for remote_path in repo_files:
        # Parse the remote path to determine type; the first component is
        # always the subset name
        parts = remote_path.split("/")
        if len(parts) < 2:
            continue

        subset = parts[0]

        # Apply subset filter
        if subsets and subset not in subsets:
            continue

        # Determine task type and apply filters
        # New format: models/<exp_name>/<model_name>.zip
        if len(parts) >= 4 and parts[1] == "models" and parts[3].endswith(".zip"):
            exp_name = parts[2]
            model_name = parts[3][:-4]  # Remove .zip

            # Apply experiments filter
            if experiments and exp_name not in experiments:
                continue

            # Apply models filter
            if models and model_name not in models:
                continue

            # Extract into the experiment directory; the ZIP's root folder
            # supplies the model_name subdirectory
            local_path = os.path.join(arena_dir, subset, "models", exp_name)
            tasks.append(UnpackTask(
                task_type=TaskType.MODEL_ZIP,
                remote_path=remote_path,
                local_path=local_path,
                subset=subset,
                name=f"{exp_name}/{model_name}",
            ))

        # Legacy format: models/<exp_name>.zip (for backward compatibility)
        elif len(parts) == 3 and parts[1] == "models" and parts[2].endswith(".zip"):
            exp_name = parts[2][:-4]  # Remove .zip

            # Apply experiments filter (legacy: models filter acts as exp filter)
            exp_filter = experiments if experiments is not None else models
            if exp_filter and exp_name not in exp_filter:
                continue

            local_path = os.path.join(arena_dir, subset, "models")
            tasks.append(UnpackTask(
                task_type=TaskType.MODEL_ZIP,
                remote_path=remote_path,
                local_path=local_path,
                subset=subset,
                name=exp_name,
            ))

        elif len(parts) >= 3 and parts[1] == "pk_logs" and parts[2].endswith(".zip"):
            # Experiment ZIP file
            exp_name = parts[2][:-4]  # Remove .zip

            if experiments and exp_name not in experiments:
                continue

            local_path = os.path.join(arena_dir, subset, "pk_logs")
            tasks.append(UnpackTask(
                task_type=TaskType.EXP_ZIP,
                remote_path=remote_path,
                local_path=local_path,
                subset=subset,
                name=exp_name,
            ))

        elif len(parts) >= 3 and parts[1] == "arena" and parts[2] == "state.json":
            # state.json -- downloaded as-is (no unpacking)
            local_path = os.path.join(arena_dir, subset, "arena", "state.json")
            tasks.append(UnpackTask(
                task_type=TaskType.SMALL_FILE,
                remote_path=remote_path,
                local_path=local_path,
                subset=subset,
                name="state.json",
            ))

        elif len(parts) >= 2 and parts[1] == "README.md":
            # README.md -- downloaded as-is (no unpacking)
            local_path = os.path.join(arena_dir, subset, "README.md")
            tasks.append(UnpackTask(
                task_type=TaskType.SMALL_FILE,
                remote_path=remote_path,
                local_path=local_path,
                subset=subset,
                name="README.md",
            ))

    return tasks
548
+
549
+
550
class TempPackingContext:
    """
    Context manager for temporary packing operations.

    On entry a scratch directory for ZIP files is created; on exit the whole
    directory (and everything staged inside it) is removed.
    """

    def __init__(self):
        # Populated on __enter__, removed on __exit__
        self.temp_dir: Optional[str] = None

    def __enter__(self) -> "TempPackingContext":
        self.temp_dir = tempfile.mkdtemp(prefix="genarena_pack_")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Best-effort cleanup; never raise from teardown
        if self.temp_dir and os.path.isdir(self.temp_dir):
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def get_temp_zip_path(self, remote_path: str) -> str:
        """
        Get a temporary path for a ZIP file.

        The remote path structure is mirrored under the scratch directory so
        every staged ZIP gets a unique, recognizable location.

        Args:
            remote_path: The remote path (used to generate unique local path)

        Returns:
            Temporary file path

        Raises:
            RuntimeError: If called outside the context manager
        """
        if not self.temp_dir:
            raise RuntimeError("TempPackingContext not entered")

        staged = os.path.join(self.temp_dir, remote_path)
        os.makedirs(os.path.dirname(staged), exist_ok=True)
        return staged
genarena/sync/submit.py ADDED
@@ -0,0 +1,833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Submission functionality for GenArena.
3
+
4
+ This module provides the ability for users to submit their evaluation results
5
+ to the official leaderboard via GitHub PR.
6
+
7
+ Workflow:
8
+ 1. Validate local submission data
9
+ 2. Upload data to user's HuggingFace repository
10
+ 3. Create submission metadata JSON
11
+ 4. Fork official repo and create PR via GitHub CLI
12
+ """
13
+
14
+ import hashlib
15
+ import json
16
+ import logging
17
+ import os
18
+ import subprocess
19
+ import tempfile
20
+ from dataclasses import dataclass, field
21
+ from datetime import datetime, timezone
22
+ from typing import Any, Optional
23
+
24
+ from genarena import __version__
25
+ from genarena.experiments import is_valid_exp_name
26
+ from genarena.logs import load_battle_records
27
+ from genarena.sync.packer import (
28
+ TempPackingContext,
29
+ pack_exp_dir,
30
+ pack_directory,
31
+ IMAGE_EXTENSIONS,
32
+ )
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+ # Default official submissions repository
37
+ DEFAULT_OFFICIAL_REPO = "genarena/submissions"
38
+
39
+ # URL to fetch official models list
40
+ OFFICIAL_MODELS_URL = (
41
+ "https://raw.githubusercontent.com/genarena/submissions/main/official_models.json"
42
+ )
43
+
44
+
45
@dataclass
class ValidationResult:
    """Result of local submission validation (see validate_local_submission)."""

    valid: bool  # True when no blocking errors were found
    exp_name: str  # Experiment name that was validated
    subset: str  # Subset the experiment belongs to
    models: list[str] = field(default_factory=list)  # All models seen in battle records (sorted)
    new_models: list[str] = field(default_factory=list)  # Models not on the official leaderboard
    existing_models: list[str] = field(default_factory=list)  # Models already on the official leaderboard
    total_battles: int = 0  # Number of battle records loaded
    battles_per_pair: dict[str, int] = field(default_factory=dict)  # "<min>_vs_<max>" pair key -> battle count
    elo_ratings: dict[str, float] = field(default_factory=dict)  # Model -> bootstrap BT-ELO rating
    elo_ci: dict[str, tuple[float, float]] = field(default_factory=dict)  # Model -> (ci_lower, ci_upper)
    evaluation_config: dict[str, Any] = field(default_factory=dict)  # Contents of the experiment's config.json
    errors: list[str] = field(default_factory=list)  # Blocking validation errors
    warnings: list[str] = field(default_factory=list)  # Non-blocking notices
62
+
63
+
64
@dataclass
class UploadResult:
    """Result of HuggingFace upload (see upload_submission_data)."""

    hf_repo: str  # Target HF repository ID
    hf_revision: str  # Revision/branch the files were uploaded to
    models_zip_path: str  # Path of the models ZIP inside the repo
    models_zip_sha256: str  # SHA256 hex digest of the models ZIP
    models_zip_size: int  # Size in bytes of the models ZIP
    pk_logs_zip_path: str  # Path of the pk_logs ZIP inside the repo
    pk_logs_zip_sha256: str  # SHA256 hex digest of the pk_logs ZIP
    pk_logs_zip_size: int  # Size in bytes of the pk_logs ZIP
76
+
77
+
78
def fetch_official_models(subset: str, timeout: int = 10) -> set[str]:
    """
    Fetch official models list from GitHub.

    Any failure (network error, bad JSON, unexpected payload shape) is
    logged as a warning and yields an empty set rather than raising.

    Args:
        subset: Subset name to get models for
        timeout: Request timeout in seconds

    Returns:
        Set of official model names for the subset
    """
    import urllib.error
    import urllib.request

    try:
        with urllib.request.urlopen(OFFICIAL_MODELS_URL, timeout=timeout) as resp:
            payload = json.load(resp)
            subset_entry = payload.get("subsets", {}).get(subset, {})
            return set(subset_entry.get("models", []))
    except urllib.error.URLError as e:
        logger.warning(f"Failed to fetch official models list: {e}")
        return set()
    except json.JSONDecodeError as e:
        logger.warning(f"Failed to parse official models list: {e}")
        return set()
    except Exception as e:
        logger.warning(f"Unexpected error fetching official models: {e}")
        return set()
105
+
106
+
107
+ def _load_experiment_config(exp_dir: str) -> dict[str, Any]:
108
+ """Load experiment configuration from config.json."""
109
+ config_path = os.path.join(exp_dir, "config.json")
110
+ if not os.path.isfile(config_path):
111
+ return {}
112
+ try:
113
+ with open(config_path, "r", encoding="utf-8") as f:
114
+ return json.load(f)
115
+ except (json.JSONDecodeError, IOError):
116
+ return {}
117
+
118
+
119
def validate_local_submission(
    arena_dir: str,
    subset: str,
    exp_name: str,
    skip_official_check: bool = False,
) -> ValidationResult:
    """
    Validate local submission data.

    Checks:
    1. exp_name format (_yyyymmdd suffix)
    2. pk_logs directory exists and has battle records
    3. models directory exists and has model outputs
    4. All models in battles have corresponding outputs
    5. At least one model is new (not in official leaderboard)

    Args:
        arena_dir: Arena directory path
        subset: Subset name
        exp_name: Experiment name
        skip_official_check: Skip checking against official models (for testing)

    Returns:
        ValidationResult with validation status and details
    """
    errors: list[str] = []
    warnings: list[str] = []

    # Check exp_name format
    if not is_valid_exp_name(exp_name):
        errors.append(
            f"Invalid exp_name format: '{exp_name}' must end with _yyyymmdd"
        )

    # Check paths exist (v2 layout: pk_logs/<exp>/ and models/<exp>/)
    pk_logs_dir = os.path.join(arena_dir, subset, "pk_logs")
    exp_dir = os.path.join(pk_logs_dir, exp_name)
    models_root = os.path.join(arena_dir, subset, "models")
    exp_models_dir = os.path.join(models_root, exp_name)

    if not os.path.isdir(exp_dir):
        errors.append(f"pk_logs directory not found: {exp_dir}")

    if not os.path.isdir(exp_models_dir):
        errors.append(f"models directory not found: {exp_models_dir}")

    # Abort early on any error so far: the remaining checks need both
    # directories to be present
    if errors:
        return ValidationResult(
            valid=False,
            exp_name=exp_name,
            subset=subset,
            errors=errors,
            warnings=warnings,
        )

    # Load battle records; an experiment without battles cannot be submitted
    records = load_battle_records(pk_logs_dir, exp_name=exp_name)
    if not records:
        errors.append("No battle records found in pk_logs")
        return ValidationResult(
            valid=False,
            exp_name=exp_name,
            subset=subset,
            errors=errors,
            warnings=warnings,
        )

    # Extract models and battle statistics from the records
    models: set[str] = set()
    battles_per_pair: dict[str, int] = {}

    for r in records:
        model_a = r.get("model_a", "")
        model_b = r.get("model_b", "")
        if model_a and model_b:
            models.add(model_a)
            models.add(model_b)
            # Ensure consistent pair key (sorted) so A-vs-B and B-vs-A count together
            pair_key = f"{min(model_a, model_b)}_vs_{max(model_a, model_b)}"
            battles_per_pair[pair_key] = battles_per_pair.get(pair_key, 0) + 1

    models_list = sorted(models)

    # Check model outputs exist: every model seen in a battle must have an
    # output directory containing at least one image
    for model in models_list:
        model_dir = os.path.join(exp_models_dir, model)
        if not os.path.isdir(model_dir):
            errors.append(f"Model output directory not found: {model_dir}")
        else:
            # Check if there are any images (top level only)
            has_images = False
            for f in os.listdir(model_dir):
                ext = os.path.splitext(f)[1].lower()
                if ext in IMAGE_EXTENSIONS:
                    has_images = True
                    break
            if not has_images:
                errors.append(f"No image files found in model directory: {model_dir}")

    # Check against official models: a submission must introduce at least
    # one model that is not already on the leaderboard
    if not skip_official_check:
        official_models = fetch_official_models(subset)
        new_models = [m for m in models_list if m not in official_models]
        existing_models = [m for m in models_list if m in official_models]

        if not new_models:
            errors.append(
                "No new models found. All models already exist in official leaderboard. "
                "Submissions must include at least one new model."
            )
    else:
        new_models = models_list
        existing_models = []
        warnings.append("Skipped official models check (--skip-official-check)")

    # Calculate ELO (only if no critical errors so far)
    elo_ratings: dict[str, float] = {}
    elo_ci: dict[str, tuple[float, float]] = {}

    if not errors:
        try:
            from genarena.bt_elo import compute_bootstrap_bt_elo

            # Only complete records (both models and a winner) feed the fit
            battles = [
                (r["model_a"], r["model_b"], r["final_winner"])
                for r in records
                if r.get("model_a") and r.get("model_b") and r.get("final_winner")
            ]

            if battles:
                bt_result = compute_bootstrap_bt_elo(battles, num_bootstrap=100)
                elo_ratings = bt_result.ratings
                for model in models_list:
                    if model in bt_result.ci_lower and model in bt_result.ci_upper:
                        elo_ci[model] = (
                            bt_result.ci_lower[model],
                            bt_result.ci_upper[model],
                        )
        except Exception as e:
            # ELO is informational; failure to compute it is non-blocking
            warnings.append(f"Failed to calculate ELO: {e}")

    # Load evaluation config
    evaluation_config = _load_experiment_config(exp_dir)

    return ValidationResult(
        valid=len(errors) == 0,
        exp_name=exp_name,
        subset=subset,
        models=models_list,
        new_models=new_models,
        existing_models=existing_models,
        total_battles=len(records),
        battles_per_pair=battles_per_pair,
        elo_ratings=elo_ratings,
        elo_ci=elo_ci,
        evaluation_config=evaluation_config,
        errors=errors,
        warnings=warnings,
    )
278
+
279
+
280
def _sha256_of_file(path: str, chunk_size: int = 1024 * 1024) -> str:
    """Return the hex SHA256 digest of *path*, hashing in chunks to bound memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def upload_submission_data(
    arena_dir: str,
    subset: str,
    exp_name: str,
    hf_repo: str,
    hf_revision: str = "main",
    show_progress: bool = True,
) -> UploadResult:
    """
    Pack and upload submission data to HuggingFace.

    Args:
        arena_dir: Arena directory path
        subset: Subset name
        exp_name: Experiment name
        hf_repo: HuggingFace repository ID (e.g., "username/repo-name")
        hf_revision: Repository revision/branch (default: "main")
        show_progress: Show upload progress.
            NOTE(review): currently unused by this implementation; kept for
            interface compatibility.

    Returns:
        UploadResult with upload details

    Raises:
        RuntimeError: If packing fails

    Any exception raised by the huggingface_hub upload calls propagates to
    the caller.
    """
    from huggingface_hub import HfApi

    api = HfApi()

    # Paths (v2 layout: models/<exp_name>/..., pk_logs/<exp_name>/...)
    exp_models_dir = os.path.join(arena_dir, subset, "models", exp_name)
    exp_dir = os.path.join(arena_dir, subset, "pk_logs", exp_name)

    with TempPackingContext() as ctx:
        # Pack model outputs (image files only)
        models_zip_path = ctx.get_temp_zip_path(f"{subset}/models/{exp_name}.zip")
        success, msg = pack_directory(
            exp_models_dir, models_zip_path, file_extensions=IMAGE_EXTENSIONS
        )
        if not success:
            raise RuntimeError(f"Failed to pack models: {msg}")

        # Integrity metadata; hash in chunks since image ZIPs can be large
        models_sha256 = _sha256_of_file(models_zip_path)
        models_size = os.path.getsize(models_zip_path)

        # Pack pk_logs (battle logs, all files)
        logs_zip_path = ctx.get_temp_zip_path(f"{subset}/pk_logs/{exp_name}.zip")
        success, msg = pack_exp_dir(exp_dir, logs_zip_path)
        if not success:
            raise RuntimeError(f"Failed to pack pk_logs: {msg}")

        logs_sha256 = _sha256_of_file(logs_zip_path)
        logs_size = os.path.getsize(logs_zip_path)

        # Upload both ZIPs to the HF dataset repo
        hf_models_path = f"{subset}/models/{exp_name}.zip"
        hf_logs_path = f"{subset}/pk_logs/{exp_name}.zip"

        logger.info(f"Uploading models ZIP ({models_size / 1024 / 1024:.1f} MB)...")
        api.upload_file(
            path_or_fileobj=models_zip_path,
            path_in_repo=hf_models_path,
            repo_id=hf_repo,
            repo_type="dataset",
            revision=hf_revision,
        )

        logger.info(f"Uploading pk_logs ZIP ({logs_size / 1024 / 1024:.1f} MB)...")
        api.upload_file(
            path_or_fileobj=logs_zip_path,
            path_in_repo=hf_logs_path,
            repo_id=hf_repo,
            repo_type="dataset",
            revision=hf_revision,
        )

        return UploadResult(
            hf_repo=hf_repo,
            hf_revision=hf_revision,
            models_zip_path=hf_models_path,
            models_zip_sha256=models_sha256,
            models_zip_size=models_size,
            pk_logs_zip_path=hf_logs_path,
            pk_logs_zip_sha256=logs_sha256,
            pk_logs_zip_size=logs_size,
        )
370
+
371
+
372
def create_submission_metadata(
    validation: ValidationResult,
    upload: UploadResult,
    github_username: str,
    title: str = "",
    description: str = "",
    contact: str = "",
) -> dict[str, Any]:
    """
    Create submission metadata JSON.

    Args:
        validation: ValidationResult from validate_local_submission
        upload: UploadResult from upload_submission_data
        github_username: GitHub username of submitter
        title: Submission title
        description: Submission description
        contact: Optional contact email

    Returns:
        Submission metadata dictionary
    """
    # Generate a unique submission ID: UTC timestamp plus a short hash of
    # (timestamp, experiment, user) to avoid collisions.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
    hash_input = f"{timestamp}{validation.exp_name}{github_username}"
    short_hash = hashlib.sha256(hash_input.encode()).hexdigest()[:8]
    submission_id = f"sub_{timestamp}_{short_hash}"

    # Build submitter info; contact is optional.
    submitter: dict[str, str] = {"github_username": github_username}
    if contact:
        submitter["contact"] = contact

    # Build evaluation config summary (extract key fields).
    eval_config = validation.evaluation_config
    evaluation_config_summary = {
        "judge_model": eval_config.get("judge_model", "unknown"),
        "prompt_module": eval_config.get("prompt", "unknown"),
        "temperature": eval_config.get("temperature", 0.0),
        "position_debiasing": True,  # Always true in genarena
    }

    # Canonical (lexicographically sorted) model pairs. Split each
    # "<a>_vs_<b>" key once instead of four times as before; sorted() of
    # the first two parts equals [min, max].
    model_pairs = [
        sorted(pair_key.split("_vs_")[:2])
        for pair_key in validation.battles_per_pair
    ]

    return {
        "schema_version": "1.0",
        "submission_id": submission_id,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "submitter": submitter,
        "experiment": {
            "exp_name": validation.exp_name,
            "subset": validation.subset,
            "models": validation.models,
            "new_models": validation.new_models,
            "existing_models": validation.existing_models,
            "model_pairs": model_pairs,
            "total_battles": validation.total_battles,
            "battles_per_pair": validation.battles_per_pair,
        },
        "data_location": {
            "hf_repo_id": upload.hf_repo,
            "hf_revision": upload.hf_revision,
            "files": {
                "models_zip": {
                    "path": upload.models_zip_path,
                    "sha256": upload.models_zip_sha256,
                    "size_bytes": upload.models_zip_size,
                },
                "pk_logs_zip": {
                    "path": upload.pk_logs_zip_path,
                    "sha256": upload.pk_logs_zip_sha256,
                    "size_bytes": upload.pk_logs_zip_size,
                },
            },
        },
        "elo_preview": {
            "ratings": validation.elo_ratings,
            "ci_95": {m: list(ci) for m, ci in validation.elo_ci.items()},
        },
        "evaluation_config": evaluation_config_summary,
        "title": title or f"Submit {validation.exp_name}",
        "description": description,
        "verification": {
            "local_validation_passed": validation.valid,
            "genarena_version": __version__,
        },
    }
464
+
465
+
466
def _get_github_username() -> Optional[str]:
    """Return the authenticated GitHub username via the gh CLI, or None."""
    try:
        proc = subprocess.run(
            ["gh", "api", "user", "-q", ".login"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        # gh missing or unresponsive — caller treats None as "unavailable".
        return None
    return proc.stdout.strip() if proc.returncode == 0 else None
480
+
481
+
482
def _check_gh_cli() -> tuple[bool, str]:
    """Verify the GitHub CLI is installed and authenticated.

    Returns:
        Tuple of (ok, human-readable status message).
    """
    try:
        # Stage 1: is gh installed at all?
        version_proc = subprocess.run(
            ["gh", "--version"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if version_proc.returncode != 0:
            return False, "GitHub CLI (gh) is not installed"

        # Stage 2: is it logged in?
        auth_proc = subprocess.run(
            ["gh", "auth", "status"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if auth_proc.returncode != 0:
            return False, "GitHub CLI is not authenticated. Run 'gh auth login' first."
    except FileNotFoundError:
        return False, "GitHub CLI (gh) is not installed. Install it from https://cli.github.com"
    except subprocess.TimeoutExpired:
        return False, "GitHub CLI timed out"

    return True, "GitHub CLI is ready"
510
+
511
+
512
+ def _generate_pr_body(submission: dict[str, Any]) -> str:
513
+ """Generate PR description body."""
514
+ exp = submission["experiment"]
515
+ elo = submission["elo_preview"]["ratings"]
516
+ eval_config = submission["evaluation_config"]
517
+
518
+ body = f"""## Submission Details
519
+
520
+ **Experiment:** `{exp['exp_name']}`
521
+ **Subset:** `{exp['subset']}`
522
+ **New Models:** {', '.join(f'`{m}`' for m in exp['new_models']) or 'None'}
523
+ **Total Battles:** {exp['total_battles']:,}
524
+ **Model Pairs:** {len(exp['model_pairs'])}
525
+
526
+ ### Evaluation Configuration
527
+
528
+ | Setting | Value |
529
+ |---------|-------|
530
+ | Judge Model | `{eval_config.get('judge_model', 'N/A')}` |
531
+ | Prompt Module | `{eval_config.get('prompt_module', 'N/A')}` |
532
+ | Temperature | {eval_config.get('temperature', 'N/A')} |
533
+ | Position Debiasing | {'Yes' if eval_config.get('position_debiasing') else 'No'} |
534
+
535
+ ### ELO Preview
536
+
537
+ | Model | ELO | 95% CI |
538
+ |-------|-----|--------|
539
+ """
540
+ ci_data = submission["elo_preview"].get("ci_95", {})
541
+ for model in sorted(elo.keys(), key=lambda m: -elo[m]):
542
+ ci = ci_data.get(model, [None, None])
543
+ ci_str = f"[{ci[0]:.1f}, {ci[1]:.1f}]" if ci[0] is not None else "N/A"
544
+ body += f"| {model} | {elo[model]:.1f} | {ci_str} |\n"
545
+
546
+ body += f"""
547
+ ### Data Location
548
+
549
+ - **HuggingFace Repo:** `{submission['data_location']['hf_repo_id']}`
550
+ - **Models ZIP:** `{submission['data_location']['files']['models_zip']['path']}`
551
+ - SHA256: `{submission['data_location']['files']['models_zip']['sha256'][:16]}...`
552
+ - Size: {submission['data_location']['files']['models_zip']['size_bytes'] / 1024 / 1024:.1f} MB
553
+ - **Logs ZIP:** `{submission['data_location']['files']['pk_logs_zip']['path']}`
554
+ - SHA256: `{submission['data_location']['files']['pk_logs_zip']['sha256'][:16]}...`
555
+ - Size: {submission['data_location']['files']['pk_logs_zip']['size_bytes'] / 1024:.1f} KB
556
+
557
+ ### Description
558
+
559
+ {submission.get('description') or submission.get('title', 'No description provided.')}
560
+
561
+ ---
562
+ *Submitted via genarena v{submission['verification']['genarena_version']}*
563
+ """
564
+ return body
565
+
566
+
567
def create_submission_pr(
    submission: dict[str, Any],
    official_repo: str = DEFAULT_OFFICIAL_REPO,
    title: Optional[str] = None,
) -> str:
    """
    Fork official repo and create PR with submission.

    Args:
        submission: Submission metadata dictionary
        official_repo: Official submissions repository (default: genarena/submissions)
        title: PR title (optional, auto-generated if not provided)

    Returns:
        PR URL

    Raises:
        RuntimeError: If PR creation fails
    """
    submission_id = submission["submission_id"]
    filename = f"{submission_id}.json"

    # Get GitHub username
    gh_username = _get_github_username()
    if not gh_username:
        raise RuntimeError("Failed to get GitHub username. Ensure gh CLI is authenticated.")

    # Fork the repo (idempotent - won't fail if already forked)
    logger.info(f"Forking {official_repo}...")
    subprocess.run(
        ["gh", "repo", "fork", official_repo, "--clone=false"],
        capture_output=True,
        text=True,
        timeout=60,
    )
    # Note: fork may "fail" if already forked, but that's OK

    # BUGFIX: derive the fork's repo name from official_repo instead of
    # hard-coding "submissions" — a non-default official_repo would
    # otherwise clone the wrong repository.
    repo_name = official_repo.rsplit("/", 1)[-1]
    fork_repo = f"{gh_username}/{repo_name}"

    # Clone forked repo to temp directory
    with tempfile.TemporaryDirectory() as tmpdir:
        logger.info(f"Cloning {fork_repo}...")
        result = subprocess.run(
            ["gh", "repo", "clone", fork_repo, tmpdir],
            capture_output=True,
            text=True,
            timeout=120,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Failed to clone fork: {result.stderr}")

        # Sync the fork's default branch with upstream (best-effort).
        logger.info("Syncing with upstream...")
        subprocess.run(
            ["gh", "repo", "sync", fork_repo, "--source", official_repo],
            capture_output=True,
            text=True,
            timeout=60,
        )

        # Pull latest changes into the local clone (best-effort).
        subprocess.run(
            ["git", "pull", "origin", "main"],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # Create a dedicated branch for this submission.
        branch_name = f"submit/{submission_id}"
        logger.info(f"Creating branch {branch_name}...")
        result = subprocess.run(
            ["git", "checkout", "-b", branch_name],
            cwd=tmpdir,
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Failed to create branch: {result.stderr}")

        # Write submission file under submissions/pending/.
        submissions_dir = os.path.join(tmpdir, "submissions", "pending")
        os.makedirs(submissions_dir, exist_ok=True)
        submission_path = os.path.join(submissions_dir, filename)

        with open(submission_path, "w", encoding="utf-8") as f:
            json.dump(submission, f, indent=2, ensure_ascii=False)

        # Commit
        logger.info("Committing submission...")
        subprocess.run(["git", "add", "."], cwd=tmpdir, check=True)

        commit_msg = title or f"Submit {submission['experiment']['exp_name']}"
        result = subprocess.run(
            ["git", "commit", "-m", commit_msg],
            cwd=tmpdir,
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Failed to commit: {result.stderr}")

        # Push
        logger.info("Pushing to fork...")
        result = subprocess.run(
            ["git", "push", "-u", "origin", branch_name],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=120,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Failed to push: {result.stderr}")

        # Create PR against the official repo from the fork's branch.
        logger.info("Creating PR...")
        pr_title = title or f"[Submission] {submission['experiment']['exp_name']}"
        pr_body = _generate_pr_body(submission)

        result = subprocess.run(
            [
                "gh", "pr", "create",
                "--repo", official_repo,
                "--head", f"{gh_username}:{branch_name}",
                "--title", pr_title,
                "--body", pr_body,
            ],
            capture_output=True,
            text=True,
            timeout=60,
        )

        if result.returncode != 0:
            raise RuntimeError(f"Failed to create PR: {result.stderr}")

        # gh prints the new PR's URL on stdout.
        pr_url = result.stdout.strip()
        return pr_url
705
+
706
+
707
def print_validation_summary(validation: ValidationResult) -> None:
    """Print a human-readable validation summary to the console."""
    print("\nValidation Results:")
    print("-" * 40)
    print("Status: PASSED" if validation.valid else "Status: FAILED")

    print(f"\nExperiment: {validation.exp_name}")
    print(f"Subset: {validation.subset}")
    print(f"Models: {len(validation.models)}")
    print(f"  New models: {', '.join(validation.new_models) or 'None'}")
    print(f"  Existing models: {', '.join(validation.existing_models) or 'None'}")
    print(f"Total battles: {validation.total_battles:,}")
    print(f"Model pairs: {len(validation.battles_per_pair)}")

    ratings = validation.elo_ratings
    if ratings:
        print("\nELO Preview:")
        # Highest-rated model first.
        for name in sorted(ratings, key=ratings.get, reverse=True):
            ci = validation.elo_ci.get(name)
            ci_text = f" [{ci[0]:.1f}, {ci[1]:.1f}]" if ci else ""
            suffix = " (new)" if name in validation.new_models else ""
            print(f"  {name}: {ratings[name]:.1f}{ci_text}{suffix}")

    cfg = validation.evaluation_config
    if cfg:
        print("\nEvaluation Config:")
        print(f"  Judge model: {cfg.get('judge_model', 'N/A')}")
        print(f"  Prompt: {cfg.get('prompt', 'N/A')}")
        print(f"  Temperature: {cfg.get('temperature', 'N/A')}")

    # Warnings first, then errors, each as an indented bullet list.
    for label, items in (("Warnings", validation.warnings), ("Errors", validation.errors)):
        if items:
            print(f"\n{label}:")
            for item in items:
                print(f"  - {item}")

    print()
756
+
757
+
758
def generate_official_models_json(
    arena_dir: str,
    output_path: Optional[str] = None,
) -> dict[str, Any]:
    """
    Generate official_models.json from arena state files.

    Scans every subset found under *arena_dir* and records the model
    list from each subset's state.json.

    Args:
        arena_dir: Path to the official arena directory
        output_path: Optional path to write the JSON file

    Returns:
        The official_models.json content as a dictionary
    """
    from genarena.state import load_state
    from genarena.sync.packer import discover_subsets

    payload: dict[str, Any] = {
        "last_updated": datetime.now(timezone.utc).isoformat(),
        "description": "List of models currently on the official GenArena leaderboard",
        "subsets": {},
    }

    for subset_name in discover_subsets(arena_dir):
        state_file = os.path.join(arena_dir, subset_name, "arena", "state.json")
        # Skip subsets without a state file or without any models.
        if not os.path.isfile(state_file):
            continue
        state = load_state(state_file)
        if not state.models:
            continue

        model_names = sorted(state.models)
        payload["subsets"][subset_name] = {
            "models": model_names,
            "model_count": len(model_names),
            "total_battles": state.total_battles,
        }

    if output_path:
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        logger.info(f"Wrote official_models.json to {output_path}")

    return payload
813
+
814
+
815
def print_official_models_summary(data: dict[str, Any]) -> None:
    """Print a per-subset summary of the official models listing."""
    print("\n=== Official Models ===\n")
    print(f"Last Updated: {data.get('last_updated', 'N/A')}")
    print()

    subset_map = data.get("subsets", {})
    if not subset_map:
        print("No subsets found.")
        return

    for subset_name, entry in sorted(subset_map.items()):
        model_names = entry.get("models", [])
        # Assemble the subset's report as one buffered write.
        lines = [f"Subset: {subset_name}", f"  Models ({len(model_names)}):"]
        lines.extend(f"    - {name}" for name in model_names)
        lines.append(f"  Total Battles: {entry.get('total_battles', 0):,}")
        lines.append("")
        print("\n".join(lines))
genarena/validation/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Validation module for GenArena submissions."""
2
+
3
+ from genarena.validation.schema import (
4
+ SUBMISSION_SCHEMA,
5
+ validate_submission_schema,
6
+ )
7
+ from genarena.validation.validator import (
8
+ validate_submission_file,
9
+ validate_submission_data,
10
+ ValidationReport,
11
+ )
12
+
13
+ __all__ = [
14
+ "SUBMISSION_SCHEMA",
15
+ "validate_submission_schema",
16
+ "validate_submission_file",
17
+ "validate_submission_data",
18
+ "ValidationReport",
19
+ ]
genarena/validation/schema.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ JSON Schema definition for GenArena submissions.
3
+
4
+ This schema defines the structure of submission metadata files
5
+ that are submitted via GitHub PR to the official leaderboard.
6
+ """
7
+
8
+ from typing import Any
9
+
10
# JSON Schema for submission metadata.
# Validated by validate_submission_schema(); structure mirrors the dict
# produced by create_submission_metadata().
SUBMISSION_SCHEMA: dict[str, Any] = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "title": "GenArena Submission",
    "description": "Metadata for a GenArena evaluation submission",
    "type": "object",
    # Top-level fields that every submission must carry.
    "required": [
        "schema_version",
        "submission_id",
        "created_at",
        "submitter",
        "experiment",
        "data_location",
        "elo_preview",
    ],
    "properties": {
        "schema_version": {
            "type": "string",
            "description": "Schema version (e.g., '1.0')",
            "pattern": "^\\d+\\.\\d+$",
        },
        "submission_id": {
            "type": "string",
            "description": "Unique submission identifier",
            # Matches IDs generated as sub_<yyyymmddThhmmss>_<8 hex chars>.
            "pattern": "^sub_\\d{8}T\\d{6}_[a-f0-9]{8}$",
        },
        "created_at": {
            "type": "string",
            "description": "ISO 8601 timestamp of submission creation",
            "format": "date-time",
        },
        # Who submitted; only the GitHub username is mandatory.
        "submitter": {
            "type": "object",
            "required": ["github_username"],
            "properties": {
                "github_username": {
                    "type": "string",
                    "description": "GitHub username of submitter",
                    "minLength": 1,
                },
                "contact": {
                    "type": "string",
                    "description": "Optional contact email",
                    "format": "email",
                },
            },
        },
        # What was evaluated: experiment identity, models, battle counts.
        "experiment": {
            "type": "object",
            "required": [
                "exp_name",
                "subset",
                "models",
                "new_models",
                "total_battles",
            ],
            "properties": {
                "exp_name": {
                    "type": "string",
                    "description": "Experiment name (must end with _yyyymmdd)",
                    "pattern": "^.+_\\d{8}$",
                },
                "subset": {
                    "type": "string",
                    "description": "Subset name (e.g., 'basic')",
                    "minLength": 1,
                },
                "models": {
                    "type": "array",
                    "description": "List of all model names in the experiment",
                    "items": {"type": "string"},
                    # A battle needs at least two participants.
                    "minItems": 2,
                },
                "new_models": {
                    "type": "array",
                    "description": "List of new model names (not in official leaderboard)",
                    "items": {"type": "string"},
                    # A submission must introduce at least one new model.
                    "minItems": 1,
                },
                "existing_models": {
                    "type": "array",
                    "description": "List of existing model names (already in official)",
                    "items": {"type": "string"},
                },
                "model_pairs": {
                    "type": "array",
                    "description": "List of model pairs evaluated",
                    "items": {
                        "type": "array",
                        "items": {"type": "string"},
                        "minItems": 2,
                        "maxItems": 2,
                    },
                },
                "total_battles": {
                    "type": "integer",
                    "description": "Total number of battles",
                    "minimum": 1,
                },
                "battles_per_pair": {
                    "type": "object",
                    "description": "Battle count per model pair",
                    "additionalProperties": {"type": "integer"},
                },
            },
        },
        # Where the packed data lives on HuggingFace.
        "data_location": {
            "type": "object",
            "required": ["hf_repo_id", "files"],
            "properties": {
                "hf_repo_id": {
                    "type": "string",
                    "description": "HuggingFace repository ID",
                    "pattern": "^[\\w.-]+/[\\w.-]+$",
                },
                "hf_revision": {
                    "type": "string",
                    "description": "HuggingFace revision/branch",
                    "default": "main",
                },
                "files": {
                    "type": "object",
                    "required": ["models_zip", "pk_logs_zip"],
                    "properties": {
                        "models_zip": {
                            "$ref": "#/$defs/file_info",
                        },
                        "pk_logs_zip": {
                            "$ref": "#/$defs/file_info",
                        },
                    },
                },
            },
        },
        # Locally computed ELO results, shown in the PR for review.
        "elo_preview": {
            "type": "object",
            "required": ["ratings"],
            "properties": {
                "ratings": {
                    "type": "object",
                    "description": "ELO ratings by model",
                    "additionalProperties": {"type": "number"},
                },
                "ci_95": {
                    "type": "object",
                    "description": "95% confidence intervals by model",
                    "additionalProperties": {
                        "type": "array",
                        "items": {"type": "number"},
                        # Exactly [lower, upper].
                        "minItems": 2,
                        "maxItems": 2,
                    },
                },
            },
        },
        "evaluation_config": {
            "type": "object",
            "description": "Evaluation configuration used",
            "properties": {
                "judge_model": {
                    "type": "string",
                    "description": "VLM judge model name",
                },
                "prompt_module": {
                    "type": "string",
                    "description": "Prompt module name",
                },
                "temperature": {
                    "type": "number",
                    "description": "VLM temperature",
                    "minimum": 0,
                },
                "position_debiasing": {
                    "type": "boolean",
                    "description": "Whether position debiasing was used",
                },
            },
        },
        "title": {
            "type": "string",
            "description": "Submission title",
        },
        "description": {
            "type": "string",
            "description": "Submission description",
        },
        "verification": {
            "type": "object",
            "properties": {
                "local_validation_passed": {
                    "type": "boolean",
                    "description": "Whether local validation passed",
                },
                "genarena_version": {
                    "type": "string",
                    "description": "genarena version used for submission",
                },
            },
        },
    },
    # Shared sub-schemas referenced via $ref above.
    "$defs": {
        "file_info": {
            "type": "object",
            "required": ["path", "sha256", "size_bytes"],
            "properties": {
                "path": {
                    "type": "string",
                    "description": "File path in HF repo",
                },
                "sha256": {
                    "type": "string",
                    "description": "SHA256 checksum",
                    "pattern": "^[a-f0-9]{64}$",
                },
                "size_bytes": {
                    "type": "integer",
                    "description": "File size in bytes",
                    "minimum": 1,
                },
            },
        },
    },
}
233
+
234
+
235
def validate_submission_schema(submission: dict[str, Any]) -> tuple[bool, list[str]]:
    """
    Validate submission against JSON schema.

    Collects *all* schema violations rather than stopping at the first
    one (jsonschema.validate raises on the first error only), so the
    submitter sees every problem in a single validation pass. Falls back
    to a structural required-field check when jsonschema is unavailable.

    Args:
        submission: Submission metadata dictionary

    Returns:
        Tuple of (is_valid, list of error messages)
    """
    try:
        import jsonschema
    except ImportError:
        # jsonschema is an optional dependency; degrade gracefully.
        return _basic_validation(submission)

    errors: list[str] = []
    try:
        validator = jsonschema.Draft202012Validator(SUBMISSION_SCHEMA)
        for err in validator.iter_errors(submission):
            errors.append(f"Schema validation error: {err.message}")
            if err.path:
                errors.append(f"  at path: {'.'.join(str(p) for p in err.path)}")
    except jsonschema.SchemaError as e:
        # The schema itself is malformed — report rather than crash.
        errors.append(f"Schema error: {e.message}")

    return not errors, errors
264
+
265
+
266
+ def _basic_validation(submission: dict[str, Any]) -> tuple[bool, list[str]]:
267
+ """Basic validation without jsonschema library."""
268
+ errors: list[str] = []
269
+
270
+ required_fields = [
271
+ "schema_version",
272
+ "submission_id",
273
+ "created_at",
274
+ "submitter",
275
+ "experiment",
276
+ "data_location",
277
+ "elo_preview",
278
+ ]
279
+
280
+ for field in required_fields:
281
+ if field not in submission:
282
+ errors.append(f"Missing required field: {field}")
283
+
284
+ if errors:
285
+ return False, errors
286
+
287
+ # Check submitter
288
+ if "github_username" not in submission.get("submitter", {}):
289
+ errors.append("Missing submitter.github_username")
290
+
291
+ # Check experiment
292
+ exp = submission.get("experiment", {})
293
+ exp_required = ["exp_name", "subset", "models", "new_models", "total_battles"]
294
+ for field in exp_required:
295
+ if field not in exp:
296
+ errors.append(f"Missing experiment.{field}")
297
+
298
+ # Check new_models is not empty
299
+ if not exp.get("new_models"):
300
+ errors.append("experiment.new_models must have at least one model")
301
+
302
+ # Check data_location
303
+ data_loc = submission.get("data_location", {})
304
+ if "hf_repo_id" not in data_loc:
305
+ errors.append("Missing data_location.hf_repo_id")
306
+ if "files" not in data_loc:
307
+ errors.append("Missing data_location.files")
308
+ else:
309
+ files = data_loc.get("files", {})
310
+ for zip_type in ["models_zip", "pk_logs_zip"]:
311
+ if zip_type not in files:
312
+ errors.append(f"Missing data_location.files.{zip_type}")
313
+ else:
314
+ file_info = files[zip_type]
315
+ for field in ["path", "sha256", "size_bytes"]:
316
+ if field not in file_info:
317
+ errors.append(f"Missing data_location.files.{zip_type}.{field}")
318
+
319
+ # Check elo_preview
320
+ if "ratings" not in submission.get("elo_preview", {}):
321
+ errors.append("Missing elo_preview.ratings")
322
+
323
+ return len(errors) == 0, errors
genarena/validation/validator.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Validator for GenArena submissions.
3
+
4
+ This module provides functions to validate submission files,
5
+ including downloading and verifying data from HuggingFace.
6
+ Used by the GitHub Actions bot for automated validation.
7
+ """
8
+
9
+ import hashlib
10
+ import json
11
+ import logging
12
+ import os
13
+ import tempfile
14
+ import zipfile
15
+ from dataclasses import dataclass, field
16
+ from typing import Any, Optional
17
+
18
+ from genarena.validation.schema import validate_submission_schema
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@dataclass
class ValidationCheck:
    """Single validation check result."""

    # Human-readable name of the check (e.g. "JSON parse").
    name: str
    # Whether the check passed.
    passed: bool
    # Error message when the check failed (or an informational note); None otherwise.
    error: Optional[str] = None
30
+
31
+
32
@dataclass
class ValidationReport:
    """Complete validation report for a submission."""

    status: str  # "success" or "failed"
    # Identity of the submission/experiment being validated.
    submission_id: str = ""
    exp_name: str = ""
    subset: str = ""
    # All model names in the experiment, and the subset claimed as new.
    models: list[str] = field(default_factory=list)
    new_models: list[str] = field(default_factory=list)
    total_battles: int = 0
    # Individual check results, in the order they were performed.
    checks: list[ValidationCheck] = field(default_factory=list)
    # ELO comparison data produced by validate_submission_data, keyed by model.
    elo_comparison: dict[str, dict[str, float]] = field(default_factory=dict)
    # Flat list of "<check name>: <error>" strings for failed checks.
    errors: list[str] = field(default_factory=list)

    def add_check(self, name: str, passed: bool, error: Optional[str] = None) -> None:
        """Add a validation check result.

        A single failed check permanently flips the report status to
        "failed" and, when an error message is given, records it in
        ``errors`` as "<name>: <error>".
        """
        self.checks.append(ValidationCheck(name=name, passed=passed, error=error))
        if not passed:
            self.status = "failed"
            if error:
                self.errors.append(f"{name}: {error}")

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "status": self.status,
            "submission_id": self.submission_id,
            "exp_name": self.exp_name,
            "subset": self.subset,
            "models": self.models,
            "new_models": self.new_models,
            "total_battles": self.total_battles,
            # Checks are flattened to plain dicts for JSON output.
            "checks": [
                {"name": c.name, "passed": c.passed, "error": c.error}
                for c in self.checks
            ],
            "elo_comparison": self.elo_comparison,
            "errors": self.errors,
        }
72
+
73
+
74
def validate_submission_file(
    submission_path: str,
    official_models_path: Optional[str] = None,
    download_data: bool = True,
) -> ValidationReport:
    """
    Validate a submission JSON file.

    Main entry point used by the GitHub Actions bot: parses the file,
    checks it against the schema, verifies claimed new models against
    the official list, and optionally verifies the uploaded data.

    Args:
        submission_path: Path to submission JSON file
        official_models_path: Path to official_models.json (optional)
        download_data: Whether to download and verify data from HF

    Returns:
        ValidationReport with all check results
    """
    report = ValidationReport(status="success")

    # Step 1: load and parse the JSON document.
    try:
        with open(submission_path, "r", encoding="utf-8") as fh:
            submission = json.load(fh)
    except json.JSONDecodeError as exc:
        report.add_check("JSON parse", False, str(exc))
        return report
    except IOError as exc:
        report.add_check("File read", False, str(exc))
        return report
    report.add_check("JSON parse", True)

    # Step 2: schema validation; abort on failure.
    schema_ok, schema_errors = validate_submission_schema(submission)
    if not schema_ok:
        for err in schema_errors:
            report.add_check("Schema validation", False, err)
        return report
    report.add_check("Schema validation", True)

    # Copy basic identifying info onto the report.
    experiment = submission.get("experiment", {})
    report.submission_id = submission.get("submission_id", "")
    report.exp_name = experiment.get("exp_name", "")
    report.subset = experiment.get("subset", "")
    report.models = experiment.get("models", [])
    report.new_models = experiment.get("new_models", [])
    report.total_battles = experiment.get("total_battles", 0)

    # Step 3: verify that claimed "new" models are absent from the
    # official leaderboard for this subset.
    if official_models_path and os.path.isfile(official_models_path):
        try:
            with open(official_models_path, "r", encoding="utf-8") as fh:
                official_data = json.load(fh)
            known_models = set(
                official_data.get("subsets", {})
                .get(report.subset, {})
                .get("models", [])
            )
            for model in report.new_models:
                if model in known_models:
                    report.add_check(
                        f"Model '{model}' is new",
                        False,
                        "Model already exists in official leaderboard",
                    )
                else:
                    report.add_check(f"Model '{model}' is new", True)
        except Exception as exc:
            report.add_check(
                "Check official models", False, f"Failed to load official models: {exc}"
            )
    else:
        report.add_check(
            "Check official models",
            True,
            "Skipped (no official_models.json provided)",
        )

    # Step 4: optionally download and verify the data from HuggingFace,
    # merging the sub-report's checks into this one.
    if not download_data:
        report.add_check("Data verification", True, "Skipped (download_data=False)")
        return report

    data_report = validate_submission_data(submission)
    for check in data_report.checks:
        report.checks.append(check)
        if not check.passed:
            report.status = "failed"
            if check.error:
                report.errors.append(f"{check.name}: {check.error}")
    report.elo_comparison = data_report.elo_comparison

    return report
172
+
173
+
174
def _sha256_file(path: str, chunk_size: int = 1 << 20) -> str:
    """Return the SHA256 hex digest of a file, reading it in chunks.

    Chunked reading keeps memory bounded for large archives (a whole-file
    read would hold the entire ZIP in memory at once).

    Args:
        path: Path to the file to hash.
        chunk_size: Bytes read per iteration (default 1 MiB).

    Returns:
        Lowercase hex digest string.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _safe_extract(zf: zipfile.ZipFile, dest: str) -> None:
    """Extract all members of *zf* into *dest*, refusing zip-slip paths.

    Submission archives come from external users and are untrusted, so any
    member whose resolved path would escape *dest* is rejected before
    extraction begins.

    Args:
        zf: Open ZipFile to extract.
        dest: Destination directory.

    Raises:
        ValueError: If a member path escapes the destination directory.
    """
    dest_root = os.path.realpath(dest)
    for member in zf.namelist():
        target = os.path.realpath(os.path.join(dest_root, member))
        if os.path.commonpath([dest_root, target]) != dest_root:
            raise ValueError(f"Unsafe path in archive: {member}")
    zf.extractall(dest)


def validate_submission_data(submission: dict[str, Any]) -> ValidationReport:
    """
    Download and validate submission data from HuggingFace.

    Downloads the pk_logs ZIP, verifies checksum, extracts battles,
    and recalculates ELO for comparison.

    Args:
        submission: Submission metadata dictionary

    Returns:
        ValidationReport with data validation results
    """
    report = ValidationReport(status="success")

    data_loc = submission.get("data_location", {})
    hf_repo = data_loc.get("hf_repo_id", "")
    hf_revision = data_loc.get("hf_revision", "main")
    files = data_loc.get("files", {})
    pk_logs_info = files.get("pk_logs_zip", {})

    if not hf_repo or not pk_logs_info:
        report.add_check("Data location", False, "Missing HF repo or file info")
        return report

    try:
        from huggingface_hub import hf_hub_download
    except ImportError:
        report.add_check(
            "HuggingFace Hub",
            False,
            "huggingface_hub not installed",
        )
        return report

    with tempfile.TemporaryDirectory() as tmpdir:
        # Download pk_logs ZIP
        try:
            pk_logs_path = hf_hub_download(
                repo_id=hf_repo,
                filename=pk_logs_info["path"],
                repo_type="dataset",
                revision=hf_revision,
                local_dir=tmpdir,
            )
            report.add_check("Download pk_logs", True)
        except Exception as e:
            report.add_check("Download pk_logs", False, str(e))
            return report

        # Verify SHA256 (chunked read so large archives stay memory-bounded)
        expected_sha = pk_logs_info.get("sha256", "")
        try:
            actual_sha = _sha256_file(pk_logs_path)
            if actual_sha == expected_sha:
                report.add_check("SHA256 checksum", True)
            else:
                report.add_check(
                    "SHA256 checksum",
                    False,
                    f"Expected {expected_sha[:16]}..., got {actual_sha[:16]}...",
                )
                return report
        except Exception as e:
            report.add_check("SHA256 checksum", False, str(e))
            return report

        # Extract ZIP, guarding against zip-slip path traversal since the
        # archive originates from an untrusted submitter.
        extract_dir = os.path.join(tmpdir, "extracted")
        try:
            with zipfile.ZipFile(pk_logs_path, "r") as zf:
                _safe_extract(zf, extract_dir)
            report.add_check("Extract ZIP", True)
        except Exception as e:
            report.add_check("Extract ZIP", False, str(e))
            return report

        # Find battle log files.
        # The ZIP structure is: <exp_name>/*.jsonl
        battle_records = []
        try:
            for root, _dirs, filenames in os.walk(extract_dir):
                # raw_outputs directories hold judge transcripts, not battles
                if "raw_outputs" in root:
                    continue
                for filename in filenames:
                    if not filename.endswith(".jsonl"):
                        continue
                    filepath = os.path.join(root, filename)
                    with open(filepath, "r", encoding="utf-8") as f:
                        for line in f:
                            line = line.strip()
                            if not line:
                                continue
                            try:
                                battle_records.append(json.loads(line))
                            except json.JSONDecodeError:
                                # Tolerate individually malformed lines; the
                                # battle-count check below catches shortfalls.
                                continue
            report.add_check("Parse battle logs", True)
        except Exception as e:
            report.add_check("Parse battle logs", False, str(e))
            return report

        # Verify battle count
        expected_battles = submission.get("experiment", {}).get("total_battles", 0)
        if len(battle_records) == expected_battles:
            report.add_check("Battle count", True)
        else:
            report.add_check(
                "Battle count",
                False,
                f"Expected {expected_battles}, got {len(battle_records)}",
            )

        # Recalculate ELO and compare against the submitted preview
        try:
            from genarena.bt_elo import compute_bt_elo_ratings

            battles = [
                (r["model_a"], r["model_b"], r["final_winner"])
                for r in battle_records
                if r.get("model_a") and r.get("model_b") and r.get("final_winner")
            ]

            if battles:
                recalc_elo = compute_bt_elo_ratings(battles)
                submitted_elo = submission.get("elo_preview", {}).get("ratings", {})

                all_match = True
                for model, submitted_rating in submitted_elo.items():
                    recalc_rating = recalc_elo.get(model, 0)
                    report.elo_comparison[model] = {
                        "submitted": submitted_rating,
                        "recalculated": recalc_rating,
                    }

                    # Allow small floating point differences (±1.0)
                    diff = abs(submitted_rating - recalc_rating)
                    if diff > 1.0:
                        report.add_check(
                            f"ELO '{model}'",
                            False,
                            f"Diff: {diff:.1f} (submitted: {submitted_rating:.1f}, "
                            f"recalc: {recalc_rating:.1f})",
                        )
                        all_match = False

                if all_match:
                    report.add_check("ELO verification", True)
            else:
                # Previously no check at all was recorded when no usable
                # battle tuples were found; record the skip explicitly.
                report.add_check(
                    "ELO verification", True, "Skipped (no usable battle records)"
                )
        except Exception as e:
            report.add_check("ELO verification", False, str(e))

    return report
genarena/visualize/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GenArena Arena Visualization Module.
3
+
4
+ Provides a web-based interface for browsing and analyzing battle records.
5
+ """
6
+
7
+ from genarena.visualize.app import create_app
8
+ from genarena.visualize.data_loader import ArenaDataLoader
9
+
10
+ __all__ = [
11
+ "create_app",
12
+ "ArenaDataLoader",
13
+ ]
14
+
genarena/visualize/app.py ADDED
@@ -0,0 +1,934 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flask application for arena visualization."""
2
+
3
+ import io
4
+ import os
5
+
6
+ from flask import Flask, jsonify, render_template, request, send_file, abort, redirect
7
+
8
+ from genarena.visualize.data_loader import ArenaDataLoader
9
+
10
+
11
def create_app(arena_dir: str, data_dir: str) -> Flask:
    """
    Create and configure the Flask application.

    Args:
        arena_dir: Path to arena directory
        data_dir: Path to data directory

    Returns:
        Configured Flask app
    """
    # Get the directory containing this file for templates/static
    app_dir = os.path.dirname(os.path.abspath(__file__))

    app = Flask(
        __name__,
        template_folder=os.path.join(app_dir, "templates"),
        static_folder=os.path.join(app_dir, "static"),
    )

    # Store paths in config
    app.config["ARENA_DIR"] = arena_dir
    app.config["DATA_DIR"] = data_dir

    # Create data loader. All route closures below capture this single
    # loader instance; it lives for the lifetime of the app.
    data_loader = ArenaDataLoader(arena_dir, data_dir)

    # ========== Page Routes ==========

    @app.route("/")
    def index():
        """Main page."""
        return render_template("index.html")

    # ========== API Routes ==========

    @app.route("/api/subsets")
    def api_subsets():
        """Get list of available subsets."""
        subsets = data_loader.discover_subsets()
        return jsonify({"subsets": subsets})

    @app.route("/api/subsets/<subset>/info")
    def api_subset_info(subset: str):
        """Get information about a subset."""
        info = data_loader.get_subset_info(subset)
        if not info:
            return jsonify({"error": "Subset not found"}), 404

        return jsonify({
            "name": info.name,
            "models": info.models,
            "experiments": info.experiments,
            "total_battles": info.total_battles,
            "min_input_images": info.min_input_images,
            "max_input_images": info.max_input_images,
            "prompt_sources": info.prompt_sources,
        })

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/battles")
    def api_battles(subset: str, exp_name: str):
        """Get paginated battle records."""
        # Parse query parameters
        page = request.args.get("page", 1, type=int)
        page_size = request.args.get("page_size", 20, type=int)
        result_filter = request.args.get("result", None, type=str)
        consistency = request.args.get("consistent", None, type=str)
        min_images = request.args.get("min_images", None, type=int)
        max_images = request.args.get("max_images", None, type=int)
        prompt_source = request.args.get("prompt_source", None, type=str)

        # Support multiple models (comma-separated or multiple params)
        models_param = request.args.get("models", None, type=str)
        models = None
        if models_param:
            models = [m.strip() for m in models_param.split(",") if m.strip()]

        # Convert consistency filter: "true"/"false" string -> bool,
        # anything else (including absent) -> None (no filtering)
        consistency_filter = None
        if consistency == "true":
            consistency_filter = True
        elif consistency == "false":
            consistency_filter = False

        # Get battles
        records, total = data_loader.get_battles(
            subset=subset,
            exp_name=exp_name,
            page=page,
            page_size=page_size,
            models=models,
            result_filter=result_filter,
            consistency_filter=consistency_filter,
            min_images=min_images,
            max_images=max_images,
            prompt_source=prompt_source,
        )

        return jsonify({
            "battles": [r.to_dict() for r in records],
            "total": total,
            "page": page,
            "page_size": page_size,
            "total_pages": (total + page_size - 1) // page_size,
        })

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/battles/<path:battle_id>")
    def api_battle_detail(subset: str, exp_name: str, battle_id: str):
        """Get detailed battle record."""
        # Parse battle_id: model_a_vs_model_b:sample_index
        # NOTE(review): a model name that itself contains "_vs_" would make
        # this split ambiguous (only the first occurrence is used) — confirm
        # model names never contain that token.
        try:
            parts = battle_id.rsplit(":", 1)
            sample_index = int(parts[1])
            model_part = parts[0]

            # Split model names
            if "_vs_" in model_part:
                models = model_part.split("_vs_")
                model_a, model_b = models[0], models[1]
            else:
                return jsonify({"error": "Invalid battle_id format"}), 400
        except (ValueError, IndexError):
            return jsonify({"error": "Invalid battle_id format"}), 400

        record = data_loader.get_battle_detail(
            subset, exp_name, model_a, model_b, sample_index
        )

        if not record:
            return jsonify({"error": "Battle not found"}), 404

        return jsonify(record.to_detail_dict())

    @app.route("/api/subsets/<subset>/stats")
    def api_stats(subset: str):
        """Get statistics for a subset."""
        exp_name = request.args.get("exp_name", None, type=str)
        stats = data_loader.get_stats(subset, exp_name)

        if not stats:
            return jsonify({"error": "Subset not found"}), 404

        return jsonify(stats)

    @app.route("/api/subsets/<subset>/leaderboard")
    def api_elo_leaderboard(subset: str):
        """Get ELO leaderboard for a subset."""
        # Support multiple models filter (comma-separated)
        models_param = request.args.get("models", None, type=str)
        filter_models = None
        if models_param:
            filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

        leaderboard = data_loader.get_elo_leaderboard(subset, filter_models)
        return jsonify({"leaderboard": leaderboard})

    @app.route("/api/subsets/<subset>/models/<path:model>/stats")
    def api_model_stats(subset: str, model: str):
        """Get detailed statistics for a specific model including win rates against all opponents."""
        exp_name = request.args.get("exp_name", "__all__", type=str)
        stats = data_loader.get_model_vs_stats(subset, model, exp_name)

        if not stats:
            return jsonify({"error": "Model not found"}), 404

        return jsonify(stats)

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/h2h")
    def api_head_to_head(subset: str, exp_name: str):
        """Get head-to-head statistics between two models."""
        model_a = request.args.get("model_a", None, type=str)
        model_b = request.args.get("model_b", None, type=str)

        if not model_a or not model_b:
            return jsonify({"error": "model_a and model_b are required"}), 400

        h2h = data_loader.get_head_to_head(subset, exp_name, model_a, model_b)
        return jsonify(h2h)

    @app.route("/api/subsets/<subset>/samples/<int:sample_index>/input_count")
    def api_input_image_count(subset: str, sample_index: int):
        """Get the number of input images for a sample."""
        count = data_loader.get_input_image_count(subset, sample_index)
        return jsonify({"count": count})

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/samples/<int:sample_index>/all_models")
    def api_sample_all_models(subset: str, exp_name: str, sample_index: int):
        """Get all model outputs for a specific sample, sorted by win rate."""
        # Support multiple models filter (comma-separated)
        models_param = request.args.get("models", None, type=str)
        filter_models = None
        if models_param:
            filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

        # stats_scope: 'filtered' = only count battles between filtered models
        #              'all' = count all battles (but show only filtered models)
        stats_scope = request.args.get("stats_scope", "filtered", type=str)

        result = data_loader.get_sample_all_models(
            subset, exp_name, sample_index, filter_models, stats_scope
        )

        if not result:
            return jsonify({"error": "Sample not found"}), 404

        return jsonify(result)

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/samples/<int:sample_index>/models/<path:model>/battles")
    def api_model_battles_for_sample(subset: str, exp_name: str, sample_index: int, model: str):
        """Get all battle records for a specific model on a specific sample."""
        # Parse optional opponent models filter (comma-separated)
        opponents_param = request.args.get("opponents", None, type=str)
        opponent_models = None
        if opponents_param:
            opponent_models = [m.strip() for m in opponents_param.split(",") if m.strip()]

        result = data_loader.get_model_battles_for_sample(
            subset=subset,
            exp_name=exp_name,
            sample_index=sample_index,
            model=model,
            opponent_models=opponent_models,
        )

        return jsonify(result)

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/prompts")
    def api_prompts(subset: str, exp_name: str):
        """Get paginated list of prompts/samples with all model outputs."""
        # Parse query parameters
        page = request.args.get("page", 1, type=int)
        page_size = request.args.get("page_size", 10, type=int)
        min_images = request.args.get("min_images", None, type=int)
        max_images = request.args.get("max_images", None, type=int)
        prompt_source = request.args.get("prompt_source", None, type=str)

        # Support multiple models filter (comma-separated)
        models_param = request.args.get("models", None, type=str)
        filter_models = None
        if models_param:
            filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

        # Get prompts
        prompts, total = data_loader.get_prompts(
            subset=subset,
            exp_name=exp_name,
            page=page,
            page_size=page_size,
            min_images=min_images,
            max_images=max_images,
            prompt_source=prompt_source,
            filter_models=filter_models,
        )

        return jsonify({
            "prompts": prompts,
            "total": total,
            "page": page,
            "page_size": page_size,
            "total_pages": (total + page_size - 1) // page_size,
        })

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/search")
    def api_search(subset: str, exp_name: str):
        """Search battles by text query (full-text search across instruction, task_type, prompt_source, metadata)."""
        # Parse query parameters
        query = request.args.get("q", "", type=str)
        page = request.args.get("page", 1, type=int)
        page_size = request.args.get("page_size", 20, type=int)
        consistency = request.args.get("consistent", None, type=str)

        # Support multiple models (comma-separated)
        models_param = request.args.get("models", None, type=str)
        models = None
        if models_param:
            models = [m.strip() for m in models_param.split(",") if m.strip()]

        # Convert consistency filter
        consistency_filter = None
        if consistency == "true":
            consistency_filter = True
        elif consistency == "false":
            consistency_filter = False

        # Search battles
        records, total = data_loader.search_battles(
            subset=subset,
            exp_name=exp_name,
            query=query,
            page=page,
            page_size=page_size,
            models=models,
            consistency_filter=consistency_filter,
        )

        return jsonify({
            "battles": [r.to_dict() for r in records],
            "total": total,
            "page": page,
            "page_size": page_size,
            "total_pages": (total + page_size - 1) // page_size,
            "query": query,
        })

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/search/prompts")
    def api_search_prompts(subset: str, exp_name: str):
        """Search prompts by text query."""
        # Parse query parameters
        query = request.args.get("q", "", type=str)
        page = request.args.get("page", 1, type=int)
        page_size = request.args.get("page_size", 10, type=int)

        # Support multiple models filter (comma-separated)
        models_param = request.args.get("models", None, type=str)
        filter_models = None
        if models_param:
            filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

        # Search prompts
        prompts, total = data_loader.search_prompts(
            subset=subset,
            exp_name=exp_name,
            query=query,
            page=page,
            page_size=page_size,
            filter_models=filter_models,
        )

        return jsonify({
            "prompts": prompts,
            "total": total,
            "page": page,
            "page_size": page_size,
            "total_pages": (total + page_size - 1) // page_size,
            "query": query,
        })

    @app.route("/api/subsets/<subset>/matrix")
    def api_win_rate_matrix(subset: str):
        """Get win rate matrix for all model pairs."""
        exp_name = request.args.get("exp_name", "__all__", type=str)

        # Support model filter (comma-separated)
        models_param = request.args.get("models", None, type=str)
        filter_models = None
        if models_param:
            filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

        result = data_loader.get_win_rate_matrix(subset, exp_name, filter_models)
        return jsonify(result)

    @app.route("/api/subsets/<subset>/leaderboard/by-source")
    def api_elo_by_source(subset: str):
        """Get ELO rankings grouped by prompt source."""
        exp_name = request.args.get("exp_name", "__all__", type=str)
        result = data_loader.get_elo_by_source(subset, exp_name)
        return jsonify(result)

    @app.route("/api/subsets/<subset>/elo-history")
    def api_elo_history(subset: str):
        """Get ELO history over time."""
        exp_name = request.args.get("exp_name", "__all__", type=str)
        granularity = request.args.get("granularity", "day", type=str)

        # Support model filter (comma-separated)
        models_param = request.args.get("models", None, type=str)
        filter_models = None
        if models_param:
            filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

        result = data_loader.get_elo_history(subset, exp_name, granularity, filter_models)
        return jsonify(result)

    @app.route("/api/overview/leaderboards")
    def api_overview_leaderboards():
        """Get leaderboard data for all subsets (for Overview page)."""
        result = data_loader.get_all_subsets_leaderboards()
        return jsonify(result)

    @app.route("/api/cross-subset/info")
    def api_cross_subset_info():
        """Get information about models across multiple subsets."""
        subsets_param = request.args.get("subsets", "", type=str)
        if not subsets_param:
            return jsonify({"error": "subsets parameter is required"}), 400

        subsets = [s.strip() for s in subsets_param.split(",") if s.strip()]
        if len(subsets) < 1:
            return jsonify({"error": "At least 1 subset required"}), 400

        result = data_loader.get_cross_subset_info(subsets)
        return jsonify(result)

    @app.route("/api/cross-subset/elo")
    def api_cross_subset_elo():
        """Compute ELO rankings across multiple subsets."""
        subsets_param = request.args.get("subsets", "", type=str)
        if not subsets_param:
            return jsonify({"error": "subsets parameter is required"}), 400

        subsets = [s.strip() for s in subsets_param.split(",") if s.strip()]
        if len(subsets) < 1:
            return jsonify({"error": "At least 1 subset required"}), 400

        exp_name = request.args.get("exp_name", "__all__", type=str)
        model_scope = request.args.get("model_scope", "all", type=str)

        result = data_loader.get_cross_subset_elo(subsets, exp_name, model_scope)
        return jsonify(result)

    # ========== Image Routes ==========

    @app.route("/images/<subset>/<model>/<int:sample_index>")
    def serve_model_image(subset: str, model: str, sample_index: int):
        """Serve model output image."""
        image_path = data_loader.get_image_path(subset, model, sample_index)

        if not image_path or not os.path.isfile(image_path):
            abort(404)

        # Determine mime type from the file extension; unknown extensions
        # fall back to image/png.
        ext = os.path.splitext(image_path)[1].lower()
        mime_types = {
            ".png": "image/png",
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".webp": "image/webp",
        }
        mimetype = mime_types.get(ext, "image/png")

        return send_file(
            image_path,
            mimetype=mimetype,
            max_age=3600,  # Cache for 1 hour
        )

    @app.route("/images/<subset>/input/<int:sample_index>")
    @app.route("/images/<subset>/input/<int:sample_index>/<int:img_idx>")
    def serve_input_image(subset: str, sample_index: int, img_idx: int = 0):
        """Serve input image from parquet dataset. Supports multiple images via img_idx."""
        image_bytes = data_loader.get_input_image_by_idx(subset, sample_index, img_idx)

        if not image_bytes:
            abort(404)

        # Bytes come from the parquet dataset; served as PNG
        # (presumably stored as PNG — confirm in data_loader).
        return send_file(
            io.BytesIO(image_bytes),
            mimetype="image/png",
            max_age=3600,
        )

    return app
463
+
464
+
465
def run_server(
    arena_dir: str,
    data_dir: str,
    host: str = "0.0.0.0",
    port: int = 8080,
    debug: bool = False,
) -> None:
    """
    Run the visualization server.

    Configures basic logging, prints a startup banner, builds the app
    (which preloads arena data and can take a while), then blocks in
    Flask's development server loop.

    Args:
        arena_dir: Path to arena directory
        data_dir: Path to data directory
        host: Host to bind to
        port: Port to listen on
        debug: Enable debug mode
    """
    import logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S"
    )

    # Hoist the banner separator: the original recomputed '='*60 in
    # placeholder-less f-strings (lint F541) on every print call.
    sep = "=" * 60

    print(f"\n{sep}")
    print(" GenArena Arena Visualizer")
    print(sep)
    print(f" Arena Dir: {arena_dir}")
    print(f" Data Dir: {data_dir}")
    print(sep)
    print(" Preloading data (this may take a while)...")
    print(f"{sep}\n")

    # create_app preloads all arena data, hence the warning above.
    app = create_app(arena_dir, data_dir)

    print(f"\n{sep}")
    print(f" Server ready: http://{host}:{port}")
    print(f"{sep}\n")

    # threaded=True lets the dev server handle concurrent requests
    # (image serving + API calls from the same page load).
    app.run(host=host, port=port, debug=debug, threaded=True)
505
+
506
+
507
+ def create_hf_app(
508
+ arena_dir: str,
509
+ data_dir: str,
510
+ hf_repo: str,
511
+ image_files: list[str],
512
+ ) -> Flask:
513
+ """
514
+ Create Flask app for HuggingFace Spaces deployment.
515
+
516
+ This version uses HF CDN URLs for model output images instead of
517
+ serving them from local filesystem.
518
+
519
+ Args:
520
+ arena_dir: Path to arena directory (metadata only, no images)
521
+ data_dir: Path to data directory containing parquet files
522
+ hf_repo: HuggingFace repo ID for image CDN URLs
523
+ image_files: List of image file paths in the HF repo
524
+
525
+ Returns:
526
+ Configured Flask app for HF Spaces
527
+ """
528
+ from genarena.visualize.data_loader import HFArenaDataLoader
529
+
530
+ # Get the directory containing this file for templates/static
531
+ app_dir = os.path.dirname(os.path.abspath(__file__))
532
+
533
+ app = Flask(
534
+ __name__,
535
+ template_folder=os.path.join(app_dir, "templates"),
536
+ static_folder=os.path.join(app_dir, "static"),
537
+ )
538
+
539
+ # Store config
540
+ app.config["ARENA_DIR"] = arena_dir
541
+ app.config["DATA_DIR"] = data_dir
542
+ app.config["USE_HF_CDN"] = True
543
+ app.config["HF_REPO"] = hf_repo
544
+
545
+ # Create HF data loader
546
+ data_loader = HFArenaDataLoader(arena_dir, data_dir, hf_repo, image_files)
547
+
548
+ # ========== Page Routes ==========
549
+
550
+ @app.route("/")
551
+ def index():
552
+ """Main page."""
553
+ return render_template("index.html")
554
+
555
+ # ========== API Routes ==========
556
+ # Copy all API routes from create_app - they work the same way
557
+
558
+ @app.route("/api/subsets")
559
+ def api_subsets():
560
+ """Get list of available subsets."""
561
+ subsets = data_loader.discover_subsets()
562
+ return jsonify({"subsets": subsets})
563
+
564
+ @app.route("/api/subsets/<subset>/info")
565
+ def api_subset_info(subset: str):
566
+ """Get information about a subset."""
567
+ info = data_loader.get_subset_info(subset)
568
+ if not info:
569
+ return jsonify({"error": "Subset not found"}), 404
570
+
571
+ return jsonify({
572
+ "name": info.name,
573
+ "models": info.models,
574
+ "experiments": info.experiments,
575
+ "total_battles": info.total_battles,
576
+ "min_input_images": info.min_input_images,
577
+ "max_input_images": info.max_input_images,
578
+ "prompt_sources": info.prompt_sources,
579
+ })
580
+
581
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/battles")
582
+ def api_battles(subset: str, exp_name: str):
583
+ """Get paginated battle records."""
584
+ page = request.args.get("page", 1, type=int)
585
+ page_size = request.args.get("page_size", 20, type=int)
586
+ result_filter = request.args.get("result", None, type=str)
587
+ consistency = request.args.get("consistent", None, type=str)
588
+ min_images = request.args.get("min_images", None, type=int)
589
+ max_images = request.args.get("max_images", None, type=int)
590
+ prompt_source = request.args.get("prompt_source", None, type=str)
591
+
592
+ models_param = request.args.get("models", None, type=str)
593
+ models = None
594
+ if models_param:
595
+ models = [m.strip() for m in models_param.split(",") if m.strip()]
596
+
597
+ consistency_filter = None
598
+ if consistency == "true":
599
+ consistency_filter = True
600
+ elif consistency == "false":
601
+ consistency_filter = False
602
+
603
+ records, total = data_loader.get_battles(
604
+ subset=subset,
605
+ exp_name=exp_name,
606
+ page=page,
607
+ page_size=page_size,
608
+ models=models,
609
+ result_filter=result_filter,
610
+ consistency_filter=consistency_filter,
611
+ min_images=min_images,
612
+ max_images=max_images,
613
+ prompt_source=prompt_source,
614
+ )
615
+
616
+ return jsonify({
617
+ "battles": [r.to_dict() for r in records],
618
+ "total": total,
619
+ "page": page,
620
+ "page_size": page_size,
621
+ "total_pages": (total + page_size - 1) // page_size,
622
+ })
623
+
624
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/battles/<path:battle_id>")
625
+ def api_battle_detail(subset: str, exp_name: str, battle_id: str):
626
+ """Get detailed battle record."""
627
+ try:
628
+ parts = battle_id.rsplit(":", 1)
629
+ sample_index = int(parts[1])
630
+ model_part = parts[0]
631
+
632
+ if "_vs_" in model_part:
633
+ models = model_part.split("_vs_")
634
+ model_a, model_b = models[0], models[1]
635
+ else:
636
+ return jsonify({"error": "Invalid battle_id format"}), 400
637
+ except (ValueError, IndexError):
638
+ return jsonify({"error": "Invalid battle_id format"}), 400
639
+
640
+ record = data_loader.get_battle_detail(
641
+ subset, exp_name, model_a, model_b, sample_index
642
+ )
643
+
644
+ if not record:
645
+ return jsonify({"error": "Battle not found"}), 404
646
+
647
+ return jsonify(record.to_detail_dict())
648
+
649
+ @app.route("/api/subsets/<subset>/stats")
650
+ def api_stats(subset: str):
651
+ """Get statistics for a subset."""
652
+ exp_name = request.args.get("exp_name", None, type=str)
653
+ stats = data_loader.get_stats(subset, exp_name)
654
+
655
+ if not stats:
656
+ return jsonify({"error": "Subset not found"}), 404
657
+
658
+ return jsonify(stats)
659
+
660
+ @app.route("/api/subsets/<subset>/leaderboard")
661
+ def api_elo_leaderboard(subset: str):
662
+ """Get ELO leaderboard for a subset."""
663
+ models_param = request.args.get("models", None, type=str)
664
+ filter_models = None
665
+ if models_param:
666
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
667
+
668
+ leaderboard = data_loader.get_elo_leaderboard(subset, filter_models)
669
+ return jsonify({"leaderboard": leaderboard})
670
+
671
+ @app.route("/api/subsets/<subset>/models/<path:model>/stats")
672
+ def api_model_stats(subset: str, model: str):
673
+ """Get detailed statistics for a specific model."""
674
+ exp_name = request.args.get("exp_name", "__all__", type=str)
675
+ stats = data_loader.get_model_vs_stats(subset, model, exp_name)
676
+
677
+ if not stats:
678
+ return jsonify({"error": "Model not found"}), 404
679
+
680
+ return jsonify(stats)
681
+
682
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/h2h")
683
+ def api_head_to_head(subset: str, exp_name: str):
684
+ """Get head-to-head statistics between two models."""
685
+ model_a = request.args.get("model_a", None, type=str)
686
+ model_b = request.args.get("model_b", None, type=str)
687
+
688
+ if not model_a or not model_b:
689
+ return jsonify({"error": "model_a and model_b are required"}), 400
690
+
691
+ h2h = data_loader.get_head_to_head(subset, exp_name, model_a, model_b)
692
+ return jsonify(h2h)
693
+
694
+ @app.route("/api/subsets/<subset>/samples/<int:sample_index>/input_count")
695
+ def api_input_image_count(subset: str, sample_index: int):
696
+ """Get the number of input images for a sample."""
697
+ count = data_loader.get_input_image_count(subset, sample_index)
698
+ return jsonify({"count": count})
699
+
700
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/samples/<int:sample_index>/all_models")
701
+ def api_sample_all_models(subset: str, exp_name: str, sample_index: int):
702
+ """Get all model outputs for a specific sample."""
703
+ models_param = request.args.get("models", None, type=str)
704
+ filter_models = None
705
+ if models_param:
706
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
707
+
708
+ stats_scope = request.args.get("stats_scope", "filtered", type=str)
709
+
710
+ result = data_loader.get_sample_all_models(
711
+ subset, exp_name, sample_index, filter_models, stats_scope
712
+ )
713
+
714
+ if not result:
715
+ return jsonify({"error": "Sample not found"}), 404
716
+
717
+ return jsonify(result)
718
+
719
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/samples/<int:sample_index>/models/<path:model>/battles")
720
+ def api_model_battles_for_sample(subset: str, exp_name: str, sample_index: int, model: str):
721
+ """Get all battle records for a specific model on a specific sample."""
722
+ opponents_param = request.args.get("opponents", None, type=str)
723
+ opponent_models = None
724
+ if opponents_param:
725
+ opponent_models = [m.strip() for m in opponents_param.split(",") if m.strip()]
726
+
727
+ result = data_loader.get_model_battles_for_sample(
728
+ subset=subset,
729
+ exp_name=exp_name,
730
+ sample_index=sample_index,
731
+ model=model,
732
+ opponent_models=opponent_models,
733
+ )
734
+
735
+ return jsonify(result)
736
+
737
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/prompts")
738
+ def api_prompts(subset: str, exp_name: str):
739
+ """Get paginated list of prompts/samples."""
740
+ page = request.args.get("page", 1, type=int)
741
+ page_size = request.args.get("page_size", 10, type=int)
742
+ min_images = request.args.get("min_images", None, type=int)
743
+ max_images = request.args.get("max_images", None, type=int)
744
+ prompt_source = request.args.get("prompt_source", None, type=str)
745
+
746
+ models_param = request.args.get("models", None, type=str)
747
+ filter_models = None
748
+ if models_param:
749
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
750
+
751
+ prompts, total = data_loader.get_prompts(
752
+ subset=subset,
753
+ exp_name=exp_name,
754
+ page=page,
755
+ page_size=page_size,
756
+ min_images=min_images,
757
+ max_images=max_images,
758
+ prompt_source=prompt_source,
759
+ filter_models=filter_models,
760
+ )
761
+
762
+ return jsonify({
763
+ "prompts": prompts,
764
+ "total": total,
765
+ "page": page,
766
+ "page_size": page_size,
767
+ "total_pages": (total + page_size - 1) // page_size,
768
+ })
769
+
770
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/search")
771
+ def api_search(subset: str, exp_name: str):
772
+ """Search battles by text query."""
773
+ query = request.args.get("q", "", type=str)
774
+ page = request.args.get("page", 1, type=int)
775
+ page_size = request.args.get("page_size", 20, type=int)
776
+ consistency = request.args.get("consistent", None, type=str)
777
+
778
+ models_param = request.args.get("models", None, type=str)
779
+ models = None
780
+ if models_param:
781
+ models = [m.strip() for m in models_param.split(",") if m.strip()]
782
+
783
+ consistency_filter = None
784
+ if consistency == "true":
785
+ consistency_filter = True
786
+ elif consistency == "false":
787
+ consistency_filter = False
788
+
789
+ records, total = data_loader.search_battles(
790
+ subset=subset,
791
+ exp_name=exp_name,
792
+ query=query,
793
+ page=page,
794
+ page_size=page_size,
795
+ models=models,
796
+ consistency_filter=consistency_filter,
797
+ )
798
+
799
+ return jsonify({
800
+ "battles": [r.to_dict() for r in records],
801
+ "total": total,
802
+ "page": page,
803
+ "page_size": page_size,
804
+ "total_pages": (total + page_size - 1) // page_size,
805
+ "query": query,
806
+ })
807
+
808
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/search/prompts")
809
+ def api_search_prompts(subset: str, exp_name: str):
810
+ """Search prompts by text query."""
811
+ query = request.args.get("q", "", type=str)
812
+ page = request.args.get("page", 1, type=int)
813
+ page_size = request.args.get("page_size", 10, type=int)
814
+
815
+ models_param = request.args.get("models", None, type=str)
816
+ filter_models = None
817
+ if models_param:
818
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
819
+
820
+ prompts, total = data_loader.search_prompts(
821
+ subset=subset,
822
+ exp_name=exp_name,
823
+ query=query,
824
+ page=page,
825
+ page_size=page_size,
826
+ filter_models=filter_models,
827
+ )
828
+
829
+ return jsonify({
830
+ "prompts": prompts,
831
+ "total": total,
832
+ "page": page,
833
+ "page_size": page_size,
834
+ "total_pages": (total + page_size - 1) // page_size,
835
+ "query": query,
836
+ })
837
+
838
+ @app.route("/api/subsets/<subset>/matrix")
839
+ def api_win_rate_matrix(subset: str):
840
+ """Get win rate matrix for all model pairs."""
841
+ exp_name = request.args.get("exp_name", "__all__", type=str)
842
+
843
+ models_param = request.args.get("models", None, type=str)
844
+ filter_models = None
845
+ if models_param:
846
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
847
+
848
+ result = data_loader.get_win_rate_matrix(subset, exp_name, filter_models)
849
+ return jsonify(result)
850
+
851
+ @app.route("/api/subsets/<subset>/leaderboard/by-source")
852
+ def api_elo_by_source(subset: str):
853
+ """Get ELO rankings grouped by prompt source."""
854
+ exp_name = request.args.get("exp_name", "__all__", type=str)
855
+ result = data_loader.get_elo_by_source(subset, exp_name)
856
+ return jsonify(result)
857
+
858
+ @app.route("/api/subsets/<subset>/elo-history")
859
+ def api_elo_history(subset: str):
860
+ """Get ELO history over time."""
861
+ exp_name = request.args.get("exp_name", "__all__", type=str)
862
+ granularity = request.args.get("granularity", "day", type=str)
863
+
864
+ models_param = request.args.get("models", None, type=str)
865
+ filter_models = None
866
+ if models_param:
867
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
868
+
869
+ result = data_loader.get_elo_history(subset, exp_name, granularity, filter_models)
870
+ return jsonify(result)
871
+
872
+ @app.route("/api/overview/leaderboards")
873
+ def api_overview_leaderboards():
874
+ """Get leaderboard data for all subsets."""
875
+ result = data_loader.get_all_subsets_leaderboards()
876
+ return jsonify(result)
877
+
878
+ @app.route("/api/cross-subset/info")
879
+ def api_cross_subset_info():
880
+ """Get information about models across multiple subsets."""
881
+ subsets_param = request.args.get("subsets", "", type=str)
882
+ if not subsets_param:
883
+ return jsonify({"error": "subsets parameter is required"}), 400
884
+
885
+ subsets = [s.strip() for s in subsets_param.split(",") if s.strip()]
886
+ if len(subsets) < 1:
887
+ return jsonify({"error": "At least 1 subset required"}), 400
888
+
889
+ result = data_loader.get_cross_subset_info(subsets)
890
+ return jsonify(result)
891
+
892
+ @app.route("/api/cross-subset/elo")
893
+ def api_cross_subset_elo():
894
+ """Compute ELO rankings across multiple subsets."""
895
+ subsets_param = request.args.get("subsets", "", type=str)
896
+ if not subsets_param:
897
+ return jsonify({"error": "subsets parameter is required"}), 400
898
+
899
+ subsets = [s.strip() for s in subsets_param.split(",") if s.strip()]
900
+ if len(subsets) < 1:
901
+ return jsonify({"error": "At least 1 subset required"}), 400
902
+
903
+ exp_name = request.args.get("exp_name", "__all__", type=str)
904
+ model_scope = request.args.get("model_scope", "all", type=str)
905
+
906
+ result = data_loader.get_cross_subset_elo(subsets, exp_name, model_scope)
907
+ return jsonify(result)
908
+
909
+ # ========== Image Routes ==========
910
+
911
+ @app.route("/images/<subset>/<model>/<int:sample_index>")
912
+ def serve_model_image(subset: str, model: str, sample_index: int):
913
+ """Redirect to HF CDN for model output images."""
914
+ url = data_loader.get_model_image_url(subset, model, sample_index)
915
+ if url:
916
+ return redirect(url)
917
+ abort(404)
918
+
919
+ @app.route("/images/<subset>/input/<int:sample_index>")
920
+ @app.route("/images/<subset>/input/<int:sample_index>/<int:img_idx>")
921
+ def serve_input_image(subset: str, sample_index: int, img_idx: int = 0):
922
+ """Serve input image from parquet dataset."""
923
+ image_bytes = data_loader.get_input_image_by_idx(subset, sample_index, img_idx)
924
+
925
+ if not image_bytes:
926
+ abort(404)
927
+
928
+ return send_file(
929
+ io.BytesIO(image_bytes),
930
+ mimetype="image/png",
931
+ max_age=3600,
932
+ )
933
+
934
+ return app
genarena/visualize/data_loader.py ADDED
@@ -0,0 +1,2331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data loader for arena visualization with preloading support."""
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ import re
7
+ from dataclasses import dataclass, field
8
+ from typing import Any, Optional
9
+
10
+ from genarena.data import DataSample, ParquetDataset, discover_subsets
11
+ from genarena.models import GlobalModelOutputManager
12
+ from genarena.state import ArenaState, load_state
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
@dataclass
class BattleRecord:
    """A single battle record with all relevant information."""

    # Battle identification
    subset: str
    exp_name: str
    sample_index: int
    model_a: str
    model_b: str

    # Battle result
    final_winner: str  # model name or "tie"
    is_consistent: bool
    timestamp: str = ""

    # Raw VLM outputs (from audit logs, optional)
    original_call: Optional[dict[str, Any]] = None
    swapped_call: Optional[dict[str, Any]] = None

    # Sample data (loaded on demand)
    instruction: str = ""
    task_type: str = ""
    input_image_count: int = 1
    prompt_source: Optional[str] = None
    original_metadata: Optional[dict[str, Any]] = None

    @property
    def id(self) -> str:
        """Unique identifier for this battle."""
        pair = f"{self.model_a}_vs_{self.model_b}"
        return ":".join((self.subset, self.exp_name, pair, str(self.sample_index)))

    @property
    def winner_display(self) -> str:
        """Display-friendly winner string ("Tie" instead of the raw "tie")."""
        return "Tie" if self.final_winner == "tie" else self.final_winner

    @property
    def models(self) -> set[str]:
        """Set of models involved in this battle."""
        return {self.model_a, self.model_b}

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization (raw VLM outputs excluded)."""
        return {
            "id": self.id,
            "subset": self.subset,
            "exp_name": self.exp_name,
            "sample_index": self.sample_index,
            "model_a": self.model_a,
            "model_b": self.model_b,
            "final_winner": self.final_winner,
            "winner_display": self.winner_display,
            "is_consistent": self.is_consistent,
            "timestamp": self.timestamp,
            "instruction": self.instruction,
            "task_type": self.task_type,
            "input_image_count": self.input_image_count,
            "prompt_source": self.prompt_source,
            "original_metadata": self.original_metadata,
            "has_audit": self.original_call is not None,
        }

    def to_detail_dict(self) -> dict[str, Any]:
        """Like to_dict(), plus the raw original/swapped VLM call payloads."""
        detail = self.to_dict()
        detail["original_call"] = self.original_call
        detail["swapped_call"] = self.swapped_call
        return detail
89
+
90
+
91
@dataclass
class SubsetInfo:
    """Information about a subset.

    Aggregated once per subset by ArenaDataLoader.get_subset_info and cached.
    """

    name: str  # subset directory name
    models: list[str]  # models known to this subset's output manager
    experiments: list[str]  # experiment dirs under pk_logs/ that contain .jsonl logs
    total_battles: int  # taken from the subset's persisted arena state
    state: Optional[ArenaState] = None  # full arena state, if state.json was loadable
    min_input_images: int = 1  # smallest non-zero input-image count seen in the data
    max_input_images: int = 1  # largest input-image count seen in the data
    prompt_sources: list[str] = field(default_factory=list)  # sorted unique prompt_source values
103
+
104
+
105
+ class ArenaDataLoader:
106
+ """
107
+ Data loader for arena visualization.
108
+
109
+ Manages loading and querying battle records across multiple subsets.
110
+ Supports preloading for better performance with large datasets.
111
+ """
112
+
113
    def __init__(self, arena_dir: str, data_dir: str, preload: bool = True):
        """
        Initialize the data loader.

        Args:
            arena_dir: Path to arena directory containing subset folders
            data_dir: Path to data directory containing parquet files
            preload: If True, preload all data at initialization
        """
        self.arena_dir = arena_dir
        self.data_dir = data_dir

        # Cached data
        self._subsets: Optional[list[str]] = None  # memoized discover_subsets() result
        self._subset_info_cache: dict[str, SubsetInfo] = {}
        self._dataset_cache: dict[str, ParquetDataset] = {}
        self._model_manager_cache: dict[str, GlobalModelOutputManager] = {}

        # Battle records cache: (subset, exp_name) -> List[BattleRecord]
        self._battle_cache: dict[tuple[str, str], list[BattleRecord]] = {}

        # Index for faster lookups: (subset, exp_name) -> {model -> [record_indices]}
        self._model_index: dict[tuple[str, str], dict[str, list[int]]] = {}

        # Sample data cache: (subset, sample_index) -> SampleMetadata dict
        self._sample_cache: dict[tuple[str, int], dict[str, Any]] = {}

        # Sample to parquet file mapping: (subset, sample_index) -> parquet_file_path
        self._sample_file_map: dict[tuple[str, int], str] = {}

        # Input image count range per subset: subset -> (min_count, max_count)
        self._image_count_range: dict[str, tuple[int, int]] = {}

        # Prompt sources per subset: subset -> list of unique prompt_source values
        self._prompt_sources: dict[str, list[str]] = {}

        # Audit logs cache: (subset, exp_name, model_a, model_b, sample_index) -> audit data
        self._audit_cache: dict[tuple[str, str, str, str, int], dict[str, Any]] = {}

        # Cross-subset ELO cache: (sorted_subsets_tuple, exp_name, model_scope) -> result dict
        self._cross_subset_elo_cache: dict[tuple[tuple[str, ...], str, str], dict[str, Any]] = {}

        if preload:
            # Eagerly read parquet metadata and every battle log up front; can
            # be slow for large arenas but makes subsequent queries fast.
            self._preload_all()
157
+
158
+ def _preload_all(self) -> None:
159
+ """Preload all data at initialization for better performance."""
160
+ logger.info("Preloading arena data...")
161
+
162
+ subsets = self.discover_subsets()
163
+ logger.info(f"Found {len(subsets)} subsets: {subsets}")
164
+
165
+ for subset in subsets:
166
+ logger.info(f"Loading subset: {subset}")
167
+
168
+ # Preload parquet dataset
169
+ self._preload_dataset(subset)
170
+
171
+ # Load subset info (models, experiments)
172
+ info = self.get_subset_info(subset)
173
+ if info:
174
+ logger.info(f" - {len(info.models)} models, {len(info.experiments)} experiments")
175
+
176
+ # Preload battle logs for each experiment
177
+ for exp_name in info.experiments:
178
+ records = self._load_battle_logs(subset, exp_name)
179
+ logger.info(f" - Experiment '{exp_name}': {len(records)} battles")
180
+
181
+ logger.info("Preloading complete!")
182
+
183
+ def _preload_dataset(self, subset: str) -> None:
184
+ """
185
+ Preload sample text data (instruction, task_type) using pyarrow directly.
186
+
187
+ This is much faster than using HuggingFace datasets because we skip
188
+ decoding image columns. Images are loaded on-demand when requested.
189
+ """
190
+ import pyarrow.parquet as pq
191
+
192
+ subset_path = os.path.join(self.data_dir, subset)
193
+ if not os.path.isdir(subset_path):
194
+ return
195
+
196
+ # Find parquet files
197
+ parquet_files = sorted([
198
+ os.path.join(subset_path, f)
199
+ for f in os.listdir(subset_path)
200
+ if f.startswith("data-") and f.endswith(".parquet")
201
+ ])
202
+
203
+ if not parquet_files:
204
+ return
205
+
206
+ logger.info(f" - Loading metadata from parquet (fast mode)...")
207
+
208
+ # Read all metadata columns + input_images (only to count, not decode)
209
+ columns_to_read = ["index", "instruction", "task_type", "input_images", "prompt_source", "original_metadata"]
210
+
211
+ total_rows = 0
212
+ min_img_count = float('inf')
213
+ max_img_count = 0
214
+ prompt_sources_set: set[str] = set()
215
+
216
+ for pf in parquet_files:
217
+ try:
218
+ # Get available columns in this file
219
+ import pyarrow.parquet as pq_schema
220
+ schema = pq.read_schema(pf)
221
+ available_columns = [c for c in columns_to_read if c in schema.names]
222
+
223
+ # Read the columns we need
224
+ table = pq.read_table(pf, columns=available_columns)
225
+
226
+ # Extract columns with defaults
227
+ def get_column(name, default=None):
228
+ if name in table.column_names:
229
+ return table.column(name).to_pylist()
230
+ return [default] * table.num_rows
231
+
232
+ indices = get_column("index", 0)
233
+ instructions = get_column("instruction", "")
234
+ task_types = get_column("task_type", "")
235
+ prompt_sources = get_column("prompt_source", None)
236
+ original_metadatas = get_column("original_metadata", None)
237
+
238
+ # Handle input_images separately for counting
239
+ has_input_images = "input_images" in table.column_names
240
+ input_images_col = table.column("input_images") if has_input_images else None
241
+
242
+ for i, idx in enumerate(indices):
243
+ idx = int(idx) if idx is not None else i
244
+
245
+ # Count input images without decoding
246
+ img_count = 0
247
+ if input_images_col is not None:
248
+ img_list = input_images_col[i].as_py()
249
+ img_count = len(img_list) if img_list else 0
250
+
251
+ min_img_count = min(min_img_count, img_count) if img_count > 0 else min_img_count
252
+ max_img_count = max(max_img_count, img_count)
253
+
254
+ # Track prompt sources
255
+ ps = prompt_sources[i] if prompt_sources[i] else None
256
+ if ps:
257
+ prompt_sources_set.add(str(ps))
258
+
259
+ # Build metadata dict
260
+ metadata = {
261
+ "instruction": str(instructions[i]) if instructions[i] else "",
262
+ "task_type": str(task_types[i]) if task_types[i] else "",
263
+ "input_image_count": img_count,
264
+ "prompt_source": ps,
265
+ "original_metadata": original_metadatas[i] if original_metadatas[i] else None,
266
+ }
267
+
268
+ self._sample_cache[(subset, idx)] = metadata
269
+ self._sample_file_map[(subset, idx)] = pf
270
+ total_rows += 1
271
+
272
+ except Exception as e:
273
+ logger.warning(f"Failed to read {pf}: {e}")
274
+ continue
275
+
276
+ # Store image count range for this subset
277
+ if total_rows > 0:
278
+ self._image_count_range[subset] = (
279
+ min_img_count if min_img_count != float('inf') else 1,
280
+ max_img_count if max_img_count > 0 else 1
281
+ )
282
+
283
+ # Store prompt sources for this subset
284
+ self._prompt_sources[subset] = sorted(prompt_sources_set)
285
+
286
+ logger.info(f" - Cached {total_rows} samples (input images: {self._image_count_range.get(subset, (1,1))}, sources: {len(prompt_sources_set)})")
287
+
288
+ def discover_subsets(self) -> list[str]:
289
+ """
290
+ Discover all available subsets.
291
+
292
+ A valid subset must exist in both arena_dir (with pk_logs) and data_dir.
293
+
294
+ Returns:
295
+ List of subset names
296
+ """
297
+ if self._subsets is not None:
298
+ return self._subsets
299
+
300
+ # Get subsets from data_dir (have parquet files)
301
+ data_subsets = set(discover_subsets(self.data_dir))
302
+
303
+ # Get subsets from arena_dir (have pk_logs)
304
+ arena_subsets = set()
305
+ if os.path.isdir(self.arena_dir):
306
+ for name in os.listdir(self.arena_dir):
307
+ subset_path = os.path.join(self.arena_dir, name)
308
+ pk_logs_path = os.path.join(subset_path, "pk_logs")
309
+ if os.path.isdir(pk_logs_path):
310
+ # Check if there are any experiment directories with battle logs
311
+ for exp_name in os.listdir(pk_logs_path):
312
+ exp_path = os.path.join(pk_logs_path, exp_name)
313
+ if os.path.isdir(exp_path):
314
+ # Check for .jsonl files
315
+ has_logs = any(
316
+ f.endswith(".jsonl")
317
+ for f in os.listdir(exp_path)
318
+ if os.path.isfile(os.path.join(exp_path, f))
319
+ )
320
+ if has_logs:
321
+ arena_subsets.add(name)
322
+ break
323
+
324
+ # Intersection: must have both data and battle logs
325
+ valid_subsets = sorted(data_subsets & arena_subsets)
326
+ self._subsets = valid_subsets
327
+ return valid_subsets
328
+
329
+ def get_subset_info(self, subset: str) -> Optional[SubsetInfo]:
330
+ """
331
+ Get information about a subset.
332
+
333
+ Args:
334
+ subset: Subset name
335
+
336
+ Returns:
337
+ SubsetInfo or None if subset doesn't exist
338
+ """
339
+ if subset in self._subset_info_cache:
340
+ return self._subset_info_cache[subset]
341
+
342
+ subset_path = os.path.join(self.arena_dir, subset)
343
+ if not os.path.isdir(subset_path):
344
+ return None
345
+
346
+ # Get models
347
+ model_manager = self._get_model_manager(subset)
348
+ models = model_manager.models if model_manager else []
349
+
350
+ # Get experiments
351
+ pk_logs_dir = os.path.join(subset_path, "pk_logs")
352
+ experiments = []
353
+ if os.path.isdir(pk_logs_dir):
354
+ for name in os.listdir(pk_logs_dir):
355
+ exp_path = os.path.join(pk_logs_dir, name)
356
+ if os.path.isdir(exp_path):
357
+ # Check for battle logs
358
+ has_logs = any(
359
+ f.endswith(".jsonl")
360
+ for f in os.listdir(exp_path)
361
+ if os.path.isfile(os.path.join(exp_path, f))
362
+ )
363
+ if has_logs:
364
+ experiments.append(name)
365
+ experiments.sort()
366
+
367
+ # Load state
368
+ state_path = os.path.join(subset_path, "arena", "state.json")
369
+ state = load_state(state_path)
370
+
371
+ # Get image count range
372
+ img_range = self._image_count_range.get(subset, (1, 1))
373
+
374
+ # Get prompt sources
375
+ prompt_sources = self._prompt_sources.get(subset, [])
376
+
377
+ info = SubsetInfo(
378
+ name=subset,
379
+ models=models,
380
+ experiments=experiments,
381
+ total_battles=state.total_battles,
382
+ state=state,
383
+ min_input_images=img_range[0],
384
+ max_input_images=img_range[1],
385
+ prompt_sources=prompt_sources,
386
+ )
387
+
388
+ self._subset_info_cache[subset] = info
389
+ return info
390
+
391
+ def _get_dataset(self, subset: str) -> Optional[ParquetDataset]:
392
+ """Get or create ParquetDataset for a subset."""
393
+ if subset not in self._dataset_cache:
394
+ try:
395
+ self._dataset_cache[subset] = ParquetDataset(self.data_dir, subset)
396
+ except Exception:
397
+ return None
398
+ return self._dataset_cache[subset]
399
+
400
+ def _get_model_manager(self, subset: str) -> Optional[GlobalModelOutputManager]:
401
+ """Get or create GlobalModelOutputManager for a subset."""
402
+ if subset not in self._model_manager_cache:
403
+ models_dir = os.path.join(self.arena_dir, subset, "models")
404
+ if os.path.isdir(models_dir):
405
+ self._model_manager_cache[subset] = GlobalModelOutputManager(models_dir)
406
+ else:
407
+ return None
408
+ return self._model_manager_cache[subset]
409
+
410
+ def _get_sample_data(self, subset: str, sample_index: int) -> dict[str, Any]:
411
+ """Get cached sample metadata."""
412
+ cache_key = (subset, sample_index)
413
+ if cache_key in self._sample_cache:
414
+ return self._sample_cache[cache_key]
415
+
416
+ # Fallback - return defaults
417
+ return {
418
+ "instruction": "",
419
+ "task_type": "",
420
+ "input_image_count": 1,
421
+ "prompt_source": None,
422
+ "original_metadata": None,
423
+ }
424
+
425
+ def _load_battle_logs(self, subset: str, exp_name: str) -> list[BattleRecord]:
426
+ """
427
+ Load battle records from log files.
428
+
429
+ Args:
430
+ subset: Subset name
431
+ exp_name: Experiment name
432
+
433
+ Returns:
434
+ List of BattleRecord objects
435
+ """
436
+ cache_key = (subset, exp_name)
437
+ if cache_key in self._battle_cache:
438
+ return self._battle_cache[cache_key]
439
+
440
+ records: list[BattleRecord] = []
441
+ exp_dir = os.path.join(self.arena_dir, subset, "pk_logs", exp_name)
442
+
443
+ if not os.path.isdir(exp_dir):
444
+ return records
445
+
446
+ # Load slim battle logs
447
+ for filename in os.listdir(exp_dir):
448
+ if not filename.endswith(".jsonl"):
449
+ continue
450
+
451
+ filepath = os.path.join(exp_dir, filename)
452
+ if not os.path.isfile(filepath):
453
+ continue
454
+
455
+ try:
456
+ with open(filepath, "r", encoding="utf-8") as f:
457
+ for line in f:
458
+ line = line.strip()
459
+ if not line:
460
+ continue
461
+ try:
462
+ data = json.loads(line)
463
+ sample_index = data.get("sample_index", -1)
464
+
465
+ # Get cached sample data
466
+ sample_meta = self._get_sample_data(subset, sample_index)
467
+
468
+ record = BattleRecord(
469
+ subset=subset,
470
+ exp_name=exp_name,
471
+ sample_index=sample_index,
472
+ model_a=data.get("model_a", ""),
473
+ model_b=data.get("model_b", ""),
474
+ final_winner=data.get("final_winner", "tie"),
475
+ is_consistent=data.get("is_consistent", False),
476
+ timestamp=data.get("timestamp", ""),
477
+ instruction=sample_meta.get("instruction", ""),
478
+ task_type=sample_meta.get("task_type", ""),
479
+ input_image_count=sample_meta.get("input_image_count", 1),
480
+ prompt_source=sample_meta.get("prompt_source"),
481
+ original_metadata=sample_meta.get("original_metadata"),
482
+ )
483
+ if record.model_a and record.model_b:
484
+ records.append(record)
485
+ except json.JSONDecodeError:
486
+ continue
487
+ except Exception:
488
+ continue
489
+
490
+ # Sort by sample_index
491
+ records.sort(key=lambda r: r.sample_index)
492
+
493
+ # Cache records
494
+ self._battle_cache[cache_key] = records
495
+
496
+ # Build model index for fast filtering
497
+ self._build_model_index(cache_key, records)
498
+
499
+ return records
500
+
501
+ def _build_model_index(
502
+ self, cache_key: tuple[str, str], records: list[BattleRecord]
503
+ ) -> None:
504
+ """Build index for fast model-based filtering."""
505
+ model_index: dict[str, list[int]] = {}
506
+
507
+ for i, record in enumerate(records):
508
+ for model in [record.model_a, record.model_b]:
509
+ if model not in model_index:
510
+ model_index[model] = []
511
+ model_index[model].append(i)
512
+
513
+ self._model_index[cache_key] = model_index
514
+
515
+ def _load_all_experiments_battles(self, subset: str) -> list[BattleRecord]:
516
+ """
517
+ Load battle records from all experiments for a subset.
518
+
519
+ Args:
520
+ subset: Subset name
521
+
522
+ Returns:
523
+ Combined list of BattleRecord objects from all experiments
524
+ """
525
+ info = self.get_subset_info(subset)
526
+ if not info:
527
+ return []
528
+
529
+ all_records: list[BattleRecord] = []
530
+ for exp_name in info.experiments:
531
+ records = self._load_battle_logs(subset, exp_name)
532
+ all_records.extend(records)
533
+
534
+ # Sort by sample_index for consistent ordering
535
+ all_records.sort(key=lambda r: (r.sample_index, r.exp_name, r.model_a, r.model_b))
536
+ return all_records
537
+
538
+ def _load_audit_log(
539
+ self, subset: str, exp_name: str, model_a: str, model_b: str, sample_index: int
540
+ ) -> Optional[dict[str, Any]]:
541
+ """
542
+ Load audit log for a specific battle.
543
+
544
+ Args:
545
+ subset: Subset name
546
+ exp_name: Experiment name
547
+ model_a: First model name
548
+ model_b: Second model name
549
+ sample_index: Sample index
550
+
551
+ Returns:
552
+ Audit data dict or None
553
+ """
554
+ cache_key = (subset, exp_name, model_a, model_b, sample_index)
555
+ if cache_key in self._audit_cache:
556
+ return self._audit_cache[cache_key]
557
+
558
+ # Determine filename (models are sorted alphabetically)
559
+ from genarena.utils import sanitize_name
560
+
561
+ first, second = sorted([model_a, model_b])
562
+ filename = f"{sanitize_name(first)}_vs_{sanitize_name(second)}.jsonl"
563
+ filepath = os.path.join(
564
+ self.arena_dir, subset, "pk_logs", exp_name, "raw_outputs", filename
565
+ )
566
+
567
+ if not os.path.isfile(filepath):
568
+ return None
569
+
570
+ try:
571
+ with open(filepath, "r", encoding="utf-8") as f:
572
+ for line in f:
573
+ line = line.strip()
574
+ if not line:
575
+ continue
576
+ try:
577
+ data = json.loads(line)
578
+ if data.get("sample_index") == sample_index:
579
+ self._audit_cache[cache_key] = data
580
+ return data
581
+ except json.JSONDecodeError:
582
+ continue
583
+ except Exception:
584
+ pass
585
+
586
+ return None
587
+
588
+ def get_battles(
589
+ self,
590
+ subset: str,
591
+ exp_name: str,
592
+ page: int = 1,
593
+ page_size: int = 20,
594
+ models: Optional[list[str]] = None,
595
+ result_filter: Optional[str] = None, # "wins", "losses", "ties"
596
+ consistency_filter: Optional[bool] = None,
597
+ min_images: Optional[int] = None,
598
+ max_images: Optional[int] = None,
599
+ prompt_source: Optional[str] = None,
600
+ ) -> tuple[list[BattleRecord], int]:
601
+ """
602
+ Get paginated battle records with filtering.
603
+
604
+ Args:
605
+ subset: Subset name
606
+ exp_name: Experiment name (use "__all__" for all experiments)
607
+ page: Page number (1-indexed)
608
+ page_size: Number of records per page
609
+ models: Filter by models (show battles involving ANY of these models)
610
+ result_filter: Filter by result relative to models ("wins", "losses", "ties")
611
+ consistency_filter: Filter by consistency (True/False/None for all)
612
+
613
+ Returns:
614
+ Tuple of (records, total_count)
615
+ """
616
+ # Handle "__all__" experiment - combine all experiments
617
+ if exp_name == "__all__":
618
+ all_records = self._load_all_experiments_battles(subset)
619
+ # For __all__, we don't use the model index optimization
620
+ cache_key = None
621
+ else:
622
+ all_records = self._load_battle_logs(subset, exp_name)
623
+ cache_key = (subset, exp_name)
624
+
625
+ # Apply filters using index for better performance
626
+ if models and cache_key and cache_key in self._model_index:
627
+ model_set = set(models)
628
+ model_index = self._model_index[cache_key]
629
+
630
+ if len(models) == 1:
631
+ # Single model: show battles involving this model
632
+ candidate_indices = set(model_index.get(models[0], []))
633
+ filtered = [all_records[i] for i in sorted(candidate_indices)]
634
+ else:
635
+ # 2+ models: show only battles BETWEEN these models (both participants must be in selected models)
636
+ # Find union of all records involving any selected model first
637
+ candidate_indices: set[int] = set()
638
+ for model in models:
639
+ if model in model_index:
640
+ candidate_indices.update(model_index[model])
641
+ # Then filter to keep only battles where BOTH models are in the selected set
642
+ filtered = [
643
+ all_records[i] for i in sorted(candidate_indices)
644
+ if all_records[i].model_a in model_set and all_records[i].model_b in model_set
645
+ ]
646
+
647
+ # Apply result filter
648
+ if result_filter:
649
+ if len(models) == 1:
650
+ # Single model: filter by that model's wins/losses/ties
651
+ model = models[0]
652
+ if result_filter == "wins":
653
+ filtered = [r for r in filtered if r.final_winner == model]
654
+ elif result_filter == "losses":
655
+ filtered = [
656
+ r
657
+ for r in filtered
658
+ if r.final_winner != "tie" and r.final_winner != model
659
+ ]
660
+ elif result_filter == "ties":
661
+ filtered = [r for r in filtered if r.final_winner == "tie"]
662
+ elif len(models) == 2:
663
+ # Two models: filter by winner (result_filter is the winning model name or "tie")
664
+ if result_filter == "ties":
665
+ filtered = [r for r in filtered if r.final_winner == "tie"]
666
+ elif result_filter in models:
667
+ # Filter by specific model winning
668
+ filtered = [r for r in filtered if r.final_winner == result_filter]
669
+ elif models:
670
+ # Fallback for __all__ mode or when index is not available
671
+ model_set = set(models)
672
+ if len(models) == 1:
673
+ model = models[0]
674
+ filtered = [r for r in all_records if model in r.models]
675
+ # Apply result filter
676
+ if result_filter:
677
+ if result_filter == "wins":
678
+ filtered = [r for r in filtered if r.final_winner == model]
679
+ elif result_filter == "losses":
680
+ filtered = [
681
+ r
682
+ for r in filtered
683
+ if r.final_winner != "tie" and r.final_winner != model
684
+ ]
685
+ elif result_filter == "ties":
686
+ filtered = [r for r in filtered if r.final_winner == "tie"]
687
+ else:
688
+ # 2+ models: show battles between these models
689
+ filtered = [
690
+ r for r in all_records
691
+ if r.model_a in model_set and r.model_b in model_set
692
+ ]
693
+ # Apply result filter
694
+ if result_filter:
695
+ if result_filter == "ties":
696
+ filtered = [r for r in filtered if r.final_winner == "tie"]
697
+ elif result_filter in models:
698
+ filtered = [r for r in filtered if r.final_winner == result_filter]
699
+ else:
700
+ filtered = all_records
701
+
702
+ # Apply consistency filter
703
+ if consistency_filter is not None:
704
+ filtered = [r for r in filtered if r.is_consistent == consistency_filter]
705
+
706
+ # Apply input image count filter
707
+ if min_images is not None or max_images is not None:
708
+ min_img = min_images if min_images is not None else 0
709
+ max_img = max_images if max_images is not None else float('inf')
710
+ filtered = [r for r in filtered if min_img <= r.input_image_count <= max_img]
711
+
712
+ # Apply prompt_source filter
713
+ if prompt_source:
714
+ filtered = [r for r in filtered if r.prompt_source == prompt_source]
715
+
716
+ total_count = len(filtered)
717
+
718
+ # Paginate
719
+ start = (page - 1) * page_size
720
+ end = start + page_size
721
+ page_records = filtered[start:end]
722
+
723
+ return page_records, total_count
724
+
725
+ def search_battles(
726
+ self,
727
+ subset: str,
728
+ exp_name: str,
729
+ query: str,
730
+ page: int = 1,
731
+ page_size: int = 20,
732
+ models: Optional[list[str]] = None,
733
+ consistency_filter: Optional[bool] = None,
734
+ search_fields: Optional[list[str]] = None,
735
+ ) -> tuple[list[BattleRecord], int]:
736
+ """
737
+ Search battle records by text query (full-text search).
738
+
739
+ Searches across instruction, task_type, prompt_source, and original_metadata.
740
+
741
+ Args:
742
+ subset: Subset name
743
+ exp_name: Experiment name (use "__all__" for all experiments)
744
+ query: Search query string (case-insensitive)
745
+ page: Page number (1-indexed)
746
+ page_size: Number of records per page
747
+ models: Optional filter by models
748
+ consistency_filter: Optional filter by consistency
749
+ search_fields: Fields to search in (default: all searchable fields)
750
+
751
+ Returns:
752
+ Tuple of (matching_records, total_count)
753
+ """
754
+ if not query or not query.strip():
755
+ # Empty query - return regular filtered results
756
+ return self.get_battles(
757
+ subset, exp_name, page, page_size,
758
+ models=models, consistency_filter=consistency_filter
759
+ )
760
+
761
+ # Normalize query for case-insensitive search
762
+ query_lower = query.lower().strip()
763
+ # Create regex pattern for more flexible matching
764
+ query_pattern = re.compile(re.escape(query_lower), re.IGNORECASE)
765
+
766
+ # Determine which fields to search
767
+ all_searchable_fields = ["instruction", "task_type", "prompt_source", "original_metadata"]
768
+ fields_to_search = search_fields if search_fields else all_searchable_fields
769
+
770
+ # Load all records
771
+ if exp_name == "__all__":
772
+ all_records = self._load_all_experiments_battles(subset)
773
+ else:
774
+ all_records = self._load_battle_logs(subset, exp_name)
775
+
776
+ # Apply model filter first (for efficiency)
777
+ if models:
778
+ model_set = set(models)
779
+ if len(models) == 1:
780
+ all_records = [r for r in all_records if models[0] in r.models]
781
+ else:
782
+ all_records = [
783
+ r for r in all_records
784
+ if r.model_a in model_set and r.model_b in model_set
785
+ ]
786
+
787
+ # Apply consistency filter
788
+ if consistency_filter is not None:
789
+ all_records = [r for r in all_records if r.is_consistent == consistency_filter]
790
+
791
+ # Search filter
792
+ def matches_query(record: BattleRecord) -> bool:
793
+ """Check if record matches the search query."""
794
+ for field_name in fields_to_search:
795
+ value = getattr(record, field_name, None)
796
+ if value is None:
797
+ continue
798
+
799
+ # Handle different field types
800
+ if field_name == "original_metadata" and isinstance(value, dict):
801
+ # Search in JSON string representation of metadata
802
+ metadata_str = json.dumps(value, ensure_ascii=False).lower()
803
+ if query_pattern.search(metadata_str):
804
+ return True
805
+ elif isinstance(value, str):
806
+ if query_pattern.search(value):
807
+ return True
808
+
809
+ return False
810
+
811
+ # Apply search filter
812
+ filtered = [r for r in all_records if matches_query(r)]
813
+
814
+ total_count = len(filtered)
815
+
816
+ # Paginate
817
+ start = (page - 1) * page_size
818
+ end = start + page_size
819
+ page_records = filtered[start:end]
820
+
821
+ return page_records, total_count
822
+
823
+ def search_prompts(
824
+ self,
825
+ subset: str,
826
+ exp_name: str,
827
+ query: str,
828
+ page: int = 1,
829
+ page_size: int = 10,
830
+ filter_models: Optional[list[str]] = None,
831
+ ) -> tuple[list[dict[str, Any]], int]:
832
+ """
833
+ Search prompts/samples by text query.
834
+
835
+ Args:
836
+ subset: Subset name
837
+ exp_name: Experiment name (use "__all__" for all experiments)
838
+ query: Search query string
839
+ page: Page number
840
+ page_size: Records per page
841
+ filter_models: Optional filter by models
842
+
843
+ Returns:
844
+ Tuple of (matching_prompts, total_count)
845
+ """
846
+ if not query or not query.strip():
847
+ # Empty query - return regular results
848
+ return self.get_prompts(subset, exp_name, page, page_size, filter_models=filter_models)
849
+
850
+ # Normalize query
851
+ query_lower = query.lower().strip()
852
+ query_pattern = re.compile(re.escape(query_lower), re.IGNORECASE)
853
+
854
+ # Load records and group by sample
855
+ if exp_name == "__all__":
856
+ all_records = self._load_all_experiments_battles(subset)
857
+ else:
858
+ all_records = self._load_battle_logs(subset, exp_name)
859
+
860
+ # Group by sample_index
861
+ sample_records: dict[int, list[BattleRecord]] = {}
862
+ for record in all_records:
863
+ if record.sample_index not in sample_records:
864
+ sample_records[record.sample_index] = []
865
+ sample_records[record.sample_index].append(record)
866
+
867
+ # Filter samples by query
868
+ matching_samples = []
869
+ for sample_index, records in sample_records.items():
870
+ if not records:
871
+ continue
872
+
873
+ first_record = records[0]
874
+
875
+ # Search in instruction, task_type, prompt_source, original_metadata
876
+ match_found = False
877
+
878
+ if first_record.instruction and query_pattern.search(first_record.instruction):
879
+ match_found = True
880
+ elif first_record.task_type and query_pattern.search(first_record.task_type):
881
+ match_found = True
882
+ elif first_record.prompt_source and query_pattern.search(first_record.prompt_source):
883
+ match_found = True
884
+ elif first_record.original_metadata:
885
+ metadata_str = json.dumps(first_record.original_metadata, ensure_ascii=False).lower()
886
+ if query_pattern.search(metadata_str):
887
+ match_found = True
888
+
889
+ if match_found:
890
+ matching_samples.append(sample_index)
891
+
892
+ # Sort and paginate
893
+ matching_samples.sort()
894
+ total_count = len(matching_samples)
895
+
896
+ start = (page - 1) * page_size
897
+ end = start + page_size
898
+ page_samples = matching_samples[start:end]
899
+
900
+ # Build result for each sample using get_sample_all_models
901
+ results = []
902
+ for sample_index in page_samples:
903
+ prompt_data = self.get_sample_all_models(subset, exp_name, sample_index, filter_models)
904
+ results.append(prompt_data)
905
+
906
+ return results, total_count
907
+
908
+ def get_battle_detail(
909
+ self, subset: str, exp_name: str, model_a: str, model_b: str, sample_index: int
910
+ ) -> Optional[BattleRecord]:
911
+ """
912
+ Get detailed battle record including VLM outputs.
913
+
914
+ Args:
915
+ subset: Subset name
916
+ exp_name: Experiment name (use "__all__" for all experiments)
917
+ model_a: First model name
918
+ model_b: Second model name
919
+ sample_index: Sample index
920
+
921
+ Returns:
922
+ BattleRecord with audit data, or None
923
+ """
924
+ # Find the battle record
925
+ if exp_name == "__all__":
926
+ all_records = self._load_all_experiments_battles(subset)
927
+ else:
928
+ all_records = self._load_battle_logs(subset, exp_name)
929
+
930
+ record = None
931
+ for r in all_records:
932
+ if (
933
+ r.sample_index == sample_index
934
+ and set([r.model_a, r.model_b]) == set([model_a, model_b])
935
+ ):
936
+ record = r
937
+ break
938
+
939
+ if not record:
940
+ return None
941
+
942
+ # Load audit data (use the record's actual exp_name for audit log lookup)
943
+ actual_exp_name = record.exp_name
944
+ audit = self._load_audit_log(
945
+ subset, actual_exp_name, record.model_a, record.model_b, sample_index
946
+ )
947
+ if audit:
948
+ record.original_call = audit.get("original_call")
949
+ record.swapped_call = audit.get("swapped_call")
950
+
951
+ return record
952
+
953
+ def get_image_path(
954
+ self, subset: str, model: str, sample_index: int
955
+ ) -> Optional[str]:
956
+ """
957
+ Get path to model output image.
958
+
959
+ Args:
960
+ subset: Subset name
961
+ model: Model name
962
+ sample_index: Sample index
963
+
964
+ Returns:
965
+ Image file path or None
966
+ """
967
+ model_manager = self._get_model_manager(subset)
968
+ if model_manager:
969
+ return model_manager.get_output_path(model, sample_index)
970
+ return None
971
+
972
+ def get_input_image(self, subset: str, sample_index: int) -> Optional[bytes]:
973
+ """
974
+ Get input image bytes for a sample.
975
+
976
+ Uses pyarrow to read directly from parquet for better performance.
977
+ Uses cached file mapping for fast lookup.
978
+
979
+ Args:
980
+ subset: Subset name
981
+ sample_index: Sample index
982
+
983
+ Returns:
984
+ Image bytes or None
985
+ """
986
+ import pyarrow.parquet as pq
987
+
988
+ # Use cached file mapping if available (fast path)
989
+ cache_key = (subset, sample_index)
990
+ if cache_key in self._sample_file_map:
991
+ pf = self._sample_file_map[cache_key]
992
+ result = self._read_image_from_parquet(pf, sample_index)
993
+ if result is not None:
994
+ return result
995
+
996
+ # Fallback: search all parquet files (slow path)
997
+ subset_path = os.path.join(self.data_dir, subset)
998
+ if not os.path.isdir(subset_path):
999
+ return None
1000
+
1001
+ parquet_files = sorted([
1002
+ os.path.join(subset_path, f)
1003
+ for f in os.listdir(subset_path)
1004
+ if f.startswith("data-") and f.endswith(".parquet")
1005
+ ])
1006
+
1007
+ for pf in parquet_files:
1008
+ result = self._read_image_from_parquet(pf, sample_index)
1009
+ if result is not None:
1010
+ return result
1011
+
1012
+ return None
1013
+
1014
+ def _read_image_from_parquet(self, parquet_file: str, sample_index: int) -> Optional[bytes]:
1015
+ """Read a single image from a parquet file."""
1016
+ import pyarrow.parquet as pq
1017
+
1018
+ try:
1019
+ table = pq.read_table(parquet_file, columns=["index", "input_images"])
1020
+ indices = table.column("index").to_pylist()
1021
+
1022
+ if sample_index not in indices:
1023
+ return None
1024
+
1025
+ row_idx = indices.index(sample_index)
1026
+ input_images = table.column("input_images")[row_idx].as_py()
1027
+
1028
+ if not input_images or len(input_images) == 0:
1029
+ return None
1030
+
1031
+ img_data = input_images[0]
1032
+
1033
+ # Handle different formats
1034
+ if isinstance(img_data, bytes):
1035
+ return img_data
1036
+ elif isinstance(img_data, dict):
1037
+ # HuggingFace Image format: {"bytes": ..., "path": ...}
1038
+ if "bytes" in img_data and img_data["bytes"]:
1039
+ return img_data["bytes"]
1040
+ elif "path" in img_data and img_data["path"]:
1041
+ path = img_data["path"]
1042
+ if os.path.isfile(path):
1043
+ with open(path, "rb") as f:
1044
+ return f.read()
1045
+
1046
+ except Exception as e:
1047
+ logger.debug(f"Error reading image from {parquet_file}: {e}")
1048
+
1049
+ return None
1050
+
1051
+ def get_input_image_count(self, subset: str, sample_index: int) -> int:
1052
+ """Get the number of input images for a sample."""
1053
+ import pyarrow.parquet as pq
1054
+
1055
+ cache_key = (subset, sample_index)
1056
+ if cache_key in self._sample_file_map:
1057
+ pf = self._sample_file_map[cache_key]
1058
+ try:
1059
+ table = pq.read_table(pf, columns=["index", "input_images"])
1060
+ indices = table.column("index").to_pylist()
1061
+ if sample_index in indices:
1062
+ row_idx = indices.index(sample_index)
1063
+ input_images = table.column("input_images")[row_idx].as_py()
1064
+ return len(input_images) if input_images else 0
1065
+ except Exception:
1066
+ pass
1067
+ return 1 # Default to 1
1068
+
1069
+ def get_input_image_by_idx(self, subset: str, sample_index: int, img_idx: int = 0) -> Optional[bytes]:
1070
+ """Get a specific input image by index."""
1071
+ import pyarrow.parquet as pq
1072
+
1073
+ cache_key = (subset, sample_index)
1074
+ if cache_key not in self._sample_file_map:
1075
+ return None
1076
+
1077
+ pf = self._sample_file_map[cache_key]
1078
+ try:
1079
+ table = pq.read_table(pf, columns=["index", "input_images"])
1080
+ indices = table.column("index").to_pylist()
1081
+
1082
+ if sample_index not in indices:
1083
+ return None
1084
+
1085
+ row_idx = indices.index(sample_index)
1086
+ input_images = table.column("input_images")[row_idx].as_py()
1087
+
1088
+ if not input_images or img_idx >= len(input_images):
1089
+ return None
1090
+
1091
+ img_data = input_images[img_idx]
1092
+
1093
+ if isinstance(img_data, bytes):
1094
+ return img_data
1095
+ elif isinstance(img_data, dict):
1096
+ if "bytes" in img_data and img_data["bytes"]:
1097
+ return img_data["bytes"]
1098
+ elif "path" in img_data and img_data["path"]:
1099
+ path = img_data["path"]
1100
+ if os.path.isfile(path):
1101
+ with open(path, "rb") as f:
1102
+ return f.read()
1103
+ except Exception as e:
1104
+ logger.debug(f"Error reading image: {e}")
1105
+
1106
+ return None
1107
+
1108
+ def get_head_to_head(
1109
+ self, subset: str, exp_name: str, model_a: str, model_b: str
1110
+ ) -> dict[str, Any]:
1111
+ """
1112
+ Get head-to-head statistics between two models.
1113
+
1114
+ Returns:
1115
+ Dict with wins_a, wins_b, ties, total, win_rate_a, win_rate_b
1116
+ """
1117
+ if exp_name == "__all__":
1118
+ all_records = self._load_all_experiments_battles(subset)
1119
+ # For __all__, we need to filter manually
1120
+ h2h_records = [
1121
+ r for r in all_records
1122
+ if set([r.model_a, r.model_b]) == set([model_a, model_b])
1123
+ ]
1124
+ else:
1125
+ all_records = self._load_battle_logs(subset, exp_name)
1126
+ cache_key = (subset, exp_name)
1127
+ model_index = self._model_index.get(cache_key, {})
1128
+
1129
+ # Find battles between these two models
1130
+ indices_a = set(model_index.get(model_a, []))
1131
+ indices_b = set(model_index.get(model_b, []))
1132
+ h2h_indices = indices_a & indices_b
1133
+ h2h_records = [all_records[idx] for idx in h2h_indices]
1134
+
1135
+ wins_a = 0
1136
+ wins_b = 0
1137
+ ties = 0
1138
+
1139
+ for record in h2h_records:
1140
+ if record.final_winner == model_a:
1141
+ wins_a += 1
1142
+ elif record.final_winner == model_b:
1143
+ wins_b += 1
1144
+ else:
1145
+ ties += 1
1146
+
1147
+ total = wins_a + wins_b + ties
1148
+
1149
+ return {
1150
+ "model_a": model_a,
1151
+ "model_b": model_b,
1152
+ "wins_a": wins_a,
1153
+ "wins_b": wins_b,
1154
+ "ties": ties,
1155
+ "total": total,
1156
+ "win_rate_a": wins_a / total if total > 0 else 0,
1157
+ "win_rate_b": wins_b / total if total > 0 else 0,
1158
+ "tie_rate": ties / total if total > 0 else 0,
1159
+ }
1160
+
1161
+ def get_win_rate_matrix(
1162
+ self,
1163
+ subset: str,
1164
+ exp_name: str = "__all__",
1165
+ filter_models: Optional[list[str]] = None,
1166
+ ) -> dict[str, Any]:
1167
+ """
1168
+ Compute win rate matrix for all model pairs.
1169
+
1170
+ Args:
1171
+ subset: Subset name
1172
+ exp_name: Experiment name (use "__all__" for all experiments)
1173
+ filter_models: Optional list of models to include
1174
+
1175
+ Returns:
1176
+ Dict with:
1177
+ - models: List of model names (sorted by ELO)
1178
+ - matrix: 2D array where matrix[i][j] = win rate of model i vs model j
1179
+ - counts: 2D array where counts[i][j] = number of battles between i and j
1180
+ - wins: 2D array where wins[i][j] = wins of model i vs model j
1181
+ """
1182
+ # Load all records
1183
+ if exp_name == "__all__":
1184
+ all_records = self._load_all_experiments_battles(subset)
1185
+ else:
1186
+ all_records = self._load_battle_logs(subset, exp_name)
1187
+
1188
+ # Determine models to include
1189
+ info = self.get_subset_info(subset)
1190
+ if filter_models:
1191
+ models = [m for m in filter_models if m in info.models]
1192
+ else:
1193
+ models = list(info.models)
1194
+
1195
+ # Get ELO leaderboard to sort models by ELO
1196
+ leaderboard = self.get_elo_leaderboard(subset, models)
1197
+ models = [entry["model"] for entry in leaderboard]
1198
+
1199
+ n = len(models)
1200
+ model_to_idx = {m: i for i, m in enumerate(models)}
1201
+
1202
+ # Initialize matrices
1203
+ wins_matrix = [[0] * n for _ in range(n)]
1204
+ counts_matrix = [[0] * n for _ in range(n)]
1205
+
1206
+ # Count wins for each pair
1207
+ model_set = set(models)
1208
+ for record in all_records:
1209
+ if record.model_a not in model_set or record.model_b not in model_set:
1210
+ continue
1211
+
1212
+ i = model_to_idx[record.model_a]
1213
+ j = model_to_idx[record.model_b]
1214
+
1215
+ # Count total battles (symmetric)
1216
+ counts_matrix[i][j] += 1
1217
+ counts_matrix[j][i] += 1
1218
+
1219
+ # Count wins
1220
+ if record.final_winner == record.model_a:
1221
+ wins_matrix[i][j] += 1
1222
+ elif record.final_winner == record.model_b:
1223
+ wins_matrix[j][i] += 1
1224
+ else:
1225
+ # Tie counts as 0.5 win for each
1226
+ wins_matrix[i][j] += 0.5
1227
+ wins_matrix[j][i] += 0.5
1228
+
1229
+ # Compute win rate matrix
1230
+ win_rate_matrix = [[0.0] * n for _ in range(n)]
1231
+ for i in range(n):
1232
+ for j in range(n):
1233
+ if counts_matrix[i][j] > 0:
1234
+ win_rate_matrix[i][j] = wins_matrix[i][j] / counts_matrix[i][j]
1235
+ elif i == j:
1236
+ win_rate_matrix[i][j] = 0.5 # Self vs self
1237
+
1238
+ return {
1239
+ "models": models,
1240
+ "matrix": win_rate_matrix,
1241
+ "counts": counts_matrix,
1242
+ "wins": wins_matrix,
1243
+ }
1244
+
1245
+ def get_elo_by_source(
1246
+ self,
1247
+ subset: str,
1248
+ exp_name: str = "__all__",
1249
+ ) -> dict[str, Any]:
1250
+ """
1251
+ Compute ELO rankings grouped by prompt_source.
1252
+
1253
+ Args:
1254
+ subset: Subset name
1255
+ exp_name: Experiment name
1256
+
1257
+ Returns:
1258
+ Dict with:
1259
+ - sources: List of source names
1260
+ - leaderboards: Dict mapping source -> list of model ELO entries
1261
+ - sample_counts: Dict mapping source -> number of samples
1262
+ - battle_counts: Dict mapping source -> number of battles
1263
+ """
1264
+ from genarena.bt_elo import compute_bt_elo_ratings
1265
+
1266
+ # Load all records
1267
+ if exp_name == "__all__":
1268
+ all_records = self._load_all_experiments_battles(subset)
1269
+ else:
1270
+ all_records = self._load_battle_logs(subset, exp_name)
1271
+
1272
+ # Group battles by prompt_source
1273
+ battles_by_source: dict[str, list[tuple[str, str, str]]] = {}
1274
+ sample_counts: dict[str, set[int]] = {}
1275
+
1276
+ for record in all_records:
1277
+ source = record.prompt_source or "unknown"
1278
+ if source not in battles_by_source:
1279
+ battles_by_source[source] = []
1280
+ sample_counts[source] = set()
1281
+
1282
+ # Convert winner to bt_elo format
1283
+ if record.final_winner == record.model_a:
1284
+ winner = "model_a"
1285
+ elif record.final_winner == record.model_b:
1286
+ winner = "model_b"
1287
+ else:
1288
+ winner = "tie"
1289
+
1290
+ battles_by_source[source].append((record.model_a, record.model_b, winner))
1291
+ sample_counts[source].add(record.sample_index)
1292
+
1293
+ # Compute ELO for each source
1294
+ leaderboards: dict[str, list[dict[str, Any]]] = {}
1295
+ battle_counts: dict[str, int] = {}
1296
+
1297
+ for source, battles in battles_by_source.items():
1298
+ if not battles:
1299
+ continue
1300
+
1301
+ battle_counts[source] = len(battles)
1302
+
1303
+ try:
1304
+ ratings = compute_bt_elo_ratings(battles)
1305
+
1306
+ # Build leaderboard
1307
+ entries = []
1308
+ for model, elo in ratings.items():
1309
+ # Count wins/losses/ties for this model in this source
1310
+ wins = losses = ties = 0
1311
+ for ma, mb, w in battles:
1312
+ if model == ma:
1313
+ if w == "model_a":
1314
+ wins += 1
1315
+ elif w == "model_b":
1316
+ losses += 1
1317
+ else:
1318
+ ties += 1
1319
+ elif model == mb:
1320
+ if w == "model_b":
1321
+ wins += 1
1322
+ elif w == "model_a":
1323
+ losses += 1
1324
+ else:
1325
+ ties += 1
1326
+
1327
+ total = wins + losses + ties
1328
+ entries.append({
1329
+ "model": model,
1330
+ "elo": round(elo, 1),
1331
+ "wins": wins,
1332
+ "losses": losses,
1333
+ "ties": ties,
1334
+ "total": total,
1335
+ "win_rate": (wins + 0.5 * ties) / total if total > 0 else 0,
1336
+ })
1337
+
1338
+ # Sort by ELO descending
1339
+ entries.sort(key=lambda x: -x["elo"])
1340
+ leaderboards[source] = entries
1341
+
1342
+ except Exception as e:
1343
+ logger.warning(f"Failed to compute ELO for source {source}: {e}")
1344
+ continue
1345
+
1346
+ # Sort sources by battle count
1347
+ sources = sorted(battle_counts.keys(), key=lambda s: -battle_counts[s])
1348
+
1349
+ return {
1350
+ "sources": sources,
1351
+ "leaderboards": leaderboards,
1352
+ "sample_counts": {s: len(sample_counts[s]) for s in sources},
1353
+ "battle_counts": battle_counts,
1354
+ }
1355
+
1356
+ def _load_elo_snapshot(self, snapshot_path: str) -> Optional[dict[str, Any]]:
1357
+ """
1358
+ Load ELO snapshot from a JSON file.
1359
+
1360
+ Args:
1361
+ snapshot_path: Path to elo_snapshot.json
1362
+
1363
+ Returns:
1364
+ Dict with elo ratings and metadata, or None if not found
1365
+ """
1366
+ if not os.path.isfile(snapshot_path):
1367
+ return None
1368
+
1369
+ try:
1370
+ with open(snapshot_path, "r", encoding="utf-8") as f:
1371
+ data = json.load(f)
1372
+
1373
+ if not isinstance(data, dict):
1374
+ return None
1375
+
1376
+ # Extract ELO ratings (support both {"elo": {...}} and direct {model: elo} format)
1377
+ elo_data = data.get("elo") if isinstance(data.get("elo"), dict) else data
1378
+ if not isinstance(elo_data, dict):
1379
+ return None
1380
+
1381
+ return {
1382
+ "elo": {str(k): float(v) for k, v in elo_data.items()},
1383
+ "battle_count": data.get("battle_count", 0),
1384
+ "model_count": data.get("model_count", len(elo_data)),
1385
+ "exp_name": data.get("exp_name", ""),
1386
+ }
1387
+ except Exception as e:
1388
+ logger.debug(f"Failed to load ELO snapshot from {snapshot_path}: {e}")
1389
+ return None
1390
+
1391
    def get_elo_history(
        self,
        subset: str,
        exp_name: str = "__all__",
        granularity: str = "experiment",
        filter_models: Optional[list[str]] = None,
        max_points: int = 50,
    ) -> dict[str, Any]:
        """
        Get ELO history over experiments by reading pre-computed elo_snapshot.json files.

        Args:
            subset: Subset name
            exp_name: Experiment name (only "__all__" or "experiment" granularity supported)
            granularity: Grouping method ("experiment" reads from snapshots; time-based not supported)
            filter_models: Optional models to track
            max_points: Maximum number of time points to return

        Returns:
            Dict with:
                - timestamps: List of experiment names (only those with a snapshot)
                - models: Dict mapping model -> list of ELO values, aligned to
                  timestamps (None where the model has no rating at that point)
                - battle_counts: List of battle counts, one per timestamp
        """
        # Get subset info for experiment order
        info = self.get_subset_info(subset)
        if not info:
            return {"timestamps": [], "models": {}, "battle_counts": []}

        # Only support experiment-level granularity (reading from snapshots)
        # Time-based granularity would require real-time computation which we want to avoid
        if granularity != "experiment":
            logger.warning(
                f"Time-based granularity '{granularity}' is not supported for ELO history. "
                f"Falling back to 'experiment' granularity."
            )

        # Get ordered list of experiments
        experiments = info.experiments
        if not experiments:
            return {"timestamps": [], "models": {}, "battle_counts": []}

        # If too many experiments, sample them evenly (always keep the last one
        # so the most recent state is represented)
        if len(experiments) > max_points:
            step = len(experiments) // max_points
            sampled = [experiments[i] for i in range(0, len(experiments), step)]
            if sampled[-1] != experiments[-1]:
                sampled.append(experiments[-1])
            experiments = sampled

        # Load ELO snapshots for each experiment
        timestamps: list[str] = []
        model_elos: dict[str, list[Optional[float]]] = {}
        battle_counts: list[int] = []

        pk_logs_dir = os.path.join(self.arena_dir, subset, "pk_logs")

        for exp in experiments:
            snapshot_path = os.path.join(pk_logs_dir, exp, "elo_snapshot.json")
            snapshot = self._load_elo_snapshot(snapshot_path)

            if snapshot is None:
                # Skip experiments without snapshots
                continue

            elo_ratings = snapshot["elo"]
            battle_count = snapshot["battle_count"]

            timestamps.append(exp)
            battle_counts.append(battle_count)

            # Update model ELOs: iterate the union of previously seen models and
            # models in this snapshot so every known series gets exactly one new
            # entry per timestamp (None when a model is absent from the snapshot)
            all_models_so_far = set(model_elos.keys()) | set(elo_ratings.keys())
            for model in all_models_so_far:
                if model not in model_elos:
                    # New model: fill with None for previous timestamps
                    model_elos[model] = [None] * (len(timestamps) - 1)
                model_elos[model].append(elo_ratings.get(model))

        # Ensure all models have the same length
        # (safety net: pads a series that fell one entry short; normally a
        # no-op since the union loop above appends for every known model)
        for model in model_elos:
            if len(model_elos[model]) < len(timestamps):
                model_elos[model].append(None)

        # Filter to requested models if specified
        if filter_models:
            filter_set = set(filter_models)
            model_elos = {m: v for m, v in model_elos.items() if m in filter_set}

        return {
            "timestamps": timestamps,
            "models": model_elos,
            "battle_counts": battle_counts,
        }
1485
+
1486
def get_cross_subset_info(
    self,
    subsets: list[str],
) -> dict[str, Any]:
    """
    Get information about models across multiple subsets.

    Args:
        subsets: List of subset names

    Returns:
        Dict with:
            - common_models: Models present in all subsets (sorted)
            - all_models: Models present in any subset (sorted)
            - per_subset_models: Dict mapping subset -> sorted list of models
            - per_subset_battles: Dict mapping subset -> battle count
            - total_battles: Sum of battle counts across subsets
    """
    per_subset_models: dict[str, set[str]] = {}
    per_subset_battles: dict[str, int] = {}

    for subset in subsets:
        info = self.get_subset_info(subset)
        if info:
            per_subset_models[subset] = set(info.models)
            per_subset_battles[subset] = info.total_battles

    if not per_subset_models:
        # BUGFIX: the empty case previously omitted "total_battles",
        # making the payload shape inconsistent with the non-empty case.
        return {
            "common_models": [],
            "all_models": [],
            "per_subset_models": {},
            "per_subset_battles": {},
            "total_battles": 0,
        }

    # Intersection = models in every subset; union = models in any subset.
    model_sets = list(per_subset_models.values())
    common_models = set.intersection(*model_sets)
    all_models = set.union(*model_sets)

    return {
        "common_models": sorted(common_models),
        "all_models": sorted(all_models),
        "per_subset_models": {s: sorted(m) for s, m in per_subset_models.items()},
        "per_subset_battles": per_subset_battles,
        "total_battles": sum(per_subset_battles.values()),
    }
1532
+
1533
def get_cross_subset_elo(
    self,
    subsets: list[str],
    exp_name: str = "__all__",
    model_scope: str = "all",
) -> dict[str, Any]:
    """
    Compute ELO rankings across multiple subsets.

    Args:
        subsets: List of subset names
        exp_name: Experiment name (use "__all__" for all)
        model_scope: "common" = only models in all subsets, "all" = all models

    Returns:
        Dict with merged leaderboard and per-subset comparison
    """
    # Check cache first (sorted tuple so subset order does not matter).
    cache_key = (tuple(sorted(subsets)), exp_name, model_scope)
    if cache_key in self._cross_subset_elo_cache:
        return self._cross_subset_elo_cache[cache_key]

    from genarena.bt_elo import compute_bt_elo_ratings

    # Get cross-subset info
    cross_info = self.get_cross_subset_info(subsets)

    # Determine models to include
    if model_scope == "common":
        included_models = set(cross_info["common_models"])
    else:
        included_models = set(cross_info["all_models"])

    def _empty_result() -> dict[str, Any]:
        # Shared payload for the two "nothing to rank" early exits.
        # Deliberately not cached, matching the previous early returns.
        return {
            "subsets": subsets,
            "model_scope": model_scope,
            "common_models": cross_info["common_models"],
            "all_models": cross_info["all_models"],
            "total_battles": 0,
            "leaderboard": [],
            "per_subset_elo": {},
        }

    if not included_models:
        return _empty_result()

    # Collect all battles across the requested subsets
    all_battles = []
    model_presence: dict[str, set[str]] = {}  # model -> set of subsets it's in

    for subset in subsets:
        if exp_name == "__all__":
            records = self._load_all_experiments_battles(subset)
        else:
            records = self._load_battle_logs(subset, exp_name)

        for record in records:
            # Skip if either model is not in included set
            if model_scope == "common":
                if record.model_a not in included_models or record.model_b not in included_models:
                    continue

            # Convert to bt_elo format; anything that is neither model
            # (e.g. "tie") counts as a tie.
            if record.final_winner == record.model_a:
                winner = "model_a"
            elif record.final_winner == record.model_b:
                winner = "model_b"
            else:
                winner = "tie"

            all_battles.append((record.model_a, record.model_b, winner))

            # Track which subsets each model appeared in
            for m in [record.model_a, record.model_b]:
                if m not in model_presence:
                    model_presence[m] = set()
                model_presence[m].add(subset)

    if not all_battles:
        return _empty_result()

    # Compute merged ELO
    try:
        ratings = compute_bt_elo_ratings(all_battles)
    except Exception as e:
        logger.error(f"Failed to compute cross-subset ELO: {e}")
        return {
            "subsets": subsets,
            "model_scope": model_scope,
            "error": str(e),
            "total_battles": len(all_battles),
            "leaderboard": [],
        }

    # Count wins/losses/ties per model
    model_stats: dict[str, dict[str, int]] = {}
    for ma, mb, winner in all_battles:
        for m in [ma, mb]:
            if m not in model_stats:
                model_stats[m] = {"wins": 0, "losses": 0, "ties": 0}

        if winner == "model_a":
            model_stats[ma]["wins"] += 1
            model_stats[mb]["losses"] += 1
        elif winner == "model_b":
            model_stats[mb]["wins"] += 1
            model_stats[ma]["losses"] += 1
        else:
            model_stats[ma]["ties"] += 1
            model_stats[mb]["ties"] += 1

    # Build leaderboard (ties count as half a win for win_rate)
    leaderboard = []
    for model, elo in ratings.items():
        stats = model_stats.get(model, {"wins": 0, "losses": 0, "ties": 0})
        total = stats["wins"] + stats["losses"] + stats["ties"]
        leaderboard.append({
            "model": model,
            "elo": round(elo, 1),
            "wins": stats["wins"],
            "losses": stats["losses"],
            "ties": stats["ties"],
            "total": total,
            "win_rate": (stats["wins"] + 0.5 * stats["ties"]) / total if total > 0 else 0,
            "subset_presence": sorted(model_presence.get(model, set())),
        })

    leaderboard.sort(key=lambda x: -x["elo"])

    # Get per-subset ELO for side-by-side comparison
    per_subset_elo: dict[str, dict[str, float]] = {}
    for subset in subsets:
        subset_lb = self.get_elo_leaderboard(subset)
        per_subset_elo[subset] = {entry["model"]: entry["elo"] for entry in subset_lb}

    result = {
        "subsets": subsets,
        "model_scope": model_scope,
        "common_models": cross_info["common_models"],
        "all_models": cross_info["all_models"],
        "total_battles": len(all_battles),
        "leaderboard": leaderboard,
        "per_subset_elo": per_subset_elo,
    }

    # Cache only the fully-computed result
    self._cross_subset_elo_cache[cache_key] = result
    return result
1687
+
1688
def get_stats(self, subset: str, exp_name: Optional[str] = None) -> dict[str, Any]:
    """
    Get statistics for a subset.

    Args:
        subset: Subset name
        exp_name: Optional experiment name (falsy -> overall state;
            "__all__" -> combine every experiment)

    Returns:
        Statistics dictionary (empty dict if the subset is unknown)
    """
    info = self.get_subset_info(subset)
    if not info:
        return {}

    # Decide where the battle records come from; None means "use the
    # aggregate counters from the overall state instead of raw records".
    if exp_name == "__all__":
        records = self._load_all_experiments_battles(subset)
    elif exp_name:
        records = self._load_battle_logs(subset, exp_name)
    else:
        records = None

    if records is None:
        # Overall state: per-record consistency/tie flags are unavailable.
        total_battles = info.total_battles
        consistent = 0
        ties = 0
    else:
        total_battles = len(records)
        consistent = sum(1 for r in records if r.is_consistent)
        ties = sum(1 for r in records if r.final_winner == "tie")

    return {
        "subset": subset,
        "models": info.models,
        "experiments": info.experiments,
        "total_battles": total_battles,
        "consistent_battles": consistent,
        "tie_battles": ties,
        "consistency_rate": consistent / total_battles if total_battles > 0 else 0,
    }
1728
+
1729
def get_model_win_stats(
    self, subset: str, exp_name: str, sample_index: int,
    filter_models: Optional[list[str]] = None
) -> dict[str, dict[str, Any]]:
    """
    Get win/loss statistics for all models on a specific sample.

    Args:
        subset: Subset name
        exp_name: Experiment name (use "__all__" for all experiments)
        sample_index: Sample index
        filter_models: Optional list of models; when given, only battles
            fought between two filtered models are counted

    Returns:
        Dict mapping model name to stats (wins, losses, ties, total, win_rate)
    """
    if exp_name == "__all__":
        all_records = self._load_all_experiments_battles(subset)
    else:
        all_records = self._load_battle_logs(subset, exp_name)

    # Keep only battles on this sample.
    sample_records = [r for r in all_records if r.sample_index == sample_index]

    if filter_models:
        allowed = set(filter_models)
        sample_records = [
            r for r in sample_records
            if r.model_a in allowed and r.model_b in allowed
        ]

    tallies: dict[str, dict[str, int]] = {}

    def _tally(name: str) -> dict[str, int]:
        # Ensure both participants get an entry even if the winner field
        # matches neither model and is not "tie".
        if name not in tallies:
            tallies[name] = {"wins": 0, "losses": 0, "ties": 0}
        return tallies[name]

    for rec in sample_records:
        stats_a = _tally(rec.model_a)
        stats_b = _tally(rec.model_b)

        if rec.final_winner == "tie":
            stats_a["ties"] += 1
            stats_b["ties"] += 1
        elif rec.final_winner == rec.model_a:
            stats_a["wins"] += 1
            stats_b["losses"] += 1
        elif rec.final_winner == rec.model_b:
            stats_b["wins"] += 1
            stats_a["losses"] += 1

    # Derive totals and win rates.
    result: dict[str, dict[str, Any]] = {}
    for name, counts in tallies.items():
        total = counts["wins"] + counts["losses"] + counts["ties"]
        result[name] = {
            "wins": counts["wins"],
            "losses": counts["losses"],
            "ties": counts["ties"],
            "total": total,
            "win_rate": counts["wins"] / total if total > 0 else 0,
        }

    return result
1793
+
1794
def get_sample_all_models(
    self, subset: str, exp_name: str, sample_index: int,
    filter_models: Optional[list[str]] = None,
    stats_scope: str = "filtered"
) -> dict[str, Any]:
    """
    Get all model outputs for a specific sample, sorted by win rate.

    Args:
        subset: Subset name
        exp_name: Experiment name
        sample_index: Sample index
        filter_models: Optional list of models to display
        stats_scope: 'filtered' = only count battles between filtered models,
            'all' = count all battles (but still display only filtered models)

    Returns:
        Dict with sample info and all model outputs sorted by win rate
    """
    sample_meta = self._get_sample_data(subset, sample_index)

    # With 'all' scope, win rates also include battles against models
    # that are hidden from the display.
    stats_filter = filter_models if stats_scope == "filtered" else None
    model_stats = self.get_model_win_stats(subset, exp_name, sample_index, stats_filter)

    model_manager = self._get_model_manager(subset)
    available_models: list[dict[str, Any]] = []

    if model_manager:
        candidates = model_manager.models
        if filter_models:
            wanted = set(filter_models)
            candidates = [m for m in candidates if m in wanted]

        for candidate in candidates:
            # Only list models whose output file actually exists on disk.
            output_path = model_manager.get_output_path(candidate, sample_index)
            if not output_path or not os.path.isfile(output_path):
                continue
            stats = model_stats.get(candidate, {
                "wins": 0, "losses": 0, "ties": 0, "total": 0, "win_rate": 0
            })
            available_models.append({
                "model": candidate,
                "wins": stats["wins"],
                "losses": stats["losses"],
                "ties": stats["ties"],
                "total": stats["total"],
                "win_rate": stats["win_rate"],
            })

    # Best win rate first; ties broken by wins, then model name.
    available_models.sort(key=lambda e: (-e["win_rate"], -e["wins"], e["model"]))

    return {
        "subset": subset,
        "exp_name": exp_name,
        "sample_index": sample_index,
        "instruction": sample_meta.get("instruction", ""),
        "task_type": sample_meta.get("task_type", ""),
        "input_image_count": sample_meta.get("input_image_count", 1),
        "prompt_source": sample_meta.get("prompt_source"),
        "original_metadata": sample_meta.get("original_metadata"),
        "models": available_models,
    }
1861
+
1862
def get_model_battles_for_sample(
    self,
    subset: str,
    exp_name: str,
    sample_index: int,
    model: str,
    opponent_models: Optional[list[str]] = None,
) -> dict[str, Any]:
    """
    Get all battle records for a specific model on a specific sample.

    Args:
        subset: Subset name
        exp_name: Experiment name (use "__all__" for all experiments)
        sample_index: Sample index
        model: The model to get battles for
        opponent_models: Optional list of opponent models to filter by

    Returns:
        Dict with model info and list of battle records
    """
    if exp_name == "__all__":
        records = self._load_all_experiments_battles(subset)
    else:
        records = self._load_battle_logs(subset, exp_name)

    battles: list[dict[str, Any]] = []
    seen_opponents: set[str] = set()

    for rec in records:
        if rec.sample_index != sample_index:
            continue
        if model != rec.model_a and model != rec.model_b:
            continue

        rival = rec.model_b if rec.model_a == model else rec.model_a
        # Collected BEFORE the opponent filter so the caller always sees
        # the full set of available opponents.
        seen_opponents.add(rival)

        if opponent_models and rival not in opponent_models:
            continue

        # Outcome from this model's point of view.
        if rec.final_winner == "tie":
            outcome = "tie"
        elif rec.final_winner == model:
            outcome = "win"
        else:
            outcome = "loss"

        entry: dict[str, Any] = {
            "opponent": rival,
            "result": outcome,
            "is_consistent": rec.is_consistent,
            "model_a": rec.model_a,
            "model_b": rec.model_b,
            "final_winner": rec.final_winner,
            "exp_name": rec.exp_name,
        }

        if rec.original_call or rec.swapped_call:
            # Judge outputs already attached to the record.
            if rec.original_call:
                entry["original_call"] = rec.original_call
            if rec.swapped_call:
                entry["swapped_call"] = rec.swapped_call
        else:
            # Fall back to the on-disk audit log for this battle.
            audit = self._load_audit_log(
                subset, rec.exp_name, rec.model_a, rec.model_b, sample_index
            )
            if audit:
                entry["original_call"] = audit.get("original_call")
                entry["swapped_call"] = audit.get("swapped_call")

        battles.append(entry)

    battles.sort(key=lambda b: b["opponent"])

    # Overall per-sample stats for this model (unfiltered).
    overall = self.get_model_win_stats(subset, exp_name, sample_index)
    stats = overall.get(model, {
        "wins": 0, "losses": 0, "ties": 0, "total": 0, "win_rate": 0
    })

    return {
        "model": model,
        "sample_index": sample_index,
        "wins": stats["wins"],
        "losses": stats["losses"],
        "ties": stats["ties"],
        "total": stats["total"],
        "win_rate": stats["win_rate"],
        "battles": battles,
        "all_opponents": sorted(list(seen_opponents)),
    }
1963
+
1964
def get_elo_leaderboard(
    self,
    subset: str,
    filter_models: Optional[list[str]] = None,
) -> list[dict[str, Any]]:
    """
    Get ELO leaderboard for a subset from state.json.

    Args:
        subset: Subset name
        filter_models: Optional list of models to show (others are dropped)

    Returns:
        List of model stats sorted by ELO rating (descending), with a
        1-based "rank" field attached
    """
    info = self.get_subset_info(subset)
    if not info or not info.state:
        return []

    wanted = set(filter_models) if filter_models else None

    leaderboard = [
        {
            "model": name,
            "elo": stats.elo,
            "wins": stats.wins,
            "losses": stats.losses,
            "ties": stats.ties,
            "total_battles": stats.total_battles,
            "win_rate": stats.win_rate,
        }
        for name, stats in info.state.models.items()
        if wanted is None or name in wanted
    ]

    # Highest ELO first (stable for equal ratings).
    leaderboard.sort(key=lambda entry: entry["elo"], reverse=True)

    for rank, entry in enumerate(leaderboard, start=1):
        entry["rank"] = rank

    return leaderboard
2009
+
2010
def get_model_vs_stats(
    self,
    subset: str,
    model: str,
    exp_name: str = "__all__",
) -> dict[str, Any]:
    """
    Get win/loss/tie stats of a specific model against all other models.

    Args:
        subset: Subset name
        model: Target model name
        exp_name: Experiment name (default "__all__" for all experiments)

    Returns:
        Dict with the model's overall stats and head-to-head stats per
        opponent (empty dict if subset/model is unknown)
    """
    info = self.get_subset_info(subset)
    if not info or not info.state:
        return {}

    state = info.state
    if model not in state.models:
        return {}

    overall = state.models[model]

    if exp_name == "__all__":
        records = self._load_all_experiments_battles(subset)
    else:
        records = self._load_battle_logs(subset, exp_name)

    # Head-to-head tallies keyed by opponent.
    head_to_head: dict[str, dict[str, int]] = {}

    for rec in records:
        if model != rec.model_a and model != rec.model_b:
            continue

        rival = rec.model_b if rec.model_a == model else rec.model_a
        counts = head_to_head.setdefault(rival, {"wins": 0, "losses": 0, "ties": 0})

        if rec.final_winner == "tie":
            counts["ties"] += 1
        elif rec.final_winner == model:
            counts["wins"] += 1
        else:
            counts["losses"] += 1

    # Attach win rate and the opponent's ELO (1000.0 if unrated).
    vs_list = []
    for rival, counts in head_to_head.items():
        total = counts["wins"] + counts["losses"] + counts["ties"]
        rival_elo = state.models[rival].elo if rival in state.models else 1000.0
        vs_list.append({
            "opponent": rival,
            "opponent_elo": rival_elo,
            "wins": counts["wins"],
            "losses": counts["losses"],
            "ties": counts["ties"],
            "total": total,
            "win_rate": counts["wins"] / total if total > 0 else 0,
        })

    vs_list.sort(key=lambda entry: entry["opponent_elo"], reverse=True)

    return {
        "model": model,
        "elo": overall.elo,
        "wins": overall.wins,
        "losses": overall.losses,
        "ties": overall.ties,
        "total_battles": overall.total_battles,
        "win_rate": overall.win_rate,
        "vs_stats": vs_list,
    }
2091
+
2092
def get_all_subsets_leaderboards(self) -> dict[str, Any]:
    """
    Get leaderboard data for all subsets (for Overview page).

    Returns:
        Dict with:
            - subsets: List of subset names
            - models: All unique model names, ordered by mean ELO across
              subsets (ties: more subsets first, then name)
            - data: Dict mapping subset -> {model -> {elo, rank, wins, ...}}
            - subset_info: Dict mapping subset -> {total_battles, model_count}
    """
    subsets = self.discover_subsets()
    data: dict[str, dict[str, dict[str, Any]]] = {}
    subset_info: dict[str, dict[str, Any]] = {}
    # model -> ELO values collected per subset (for the mean-ELO ordering).
    elo_samples: dict[str, list[float]] = {}

    for subset in subsets:
        leaderboard = self.get_elo_leaderboard(subset)
        info = self.get_subset_info(subset)

        if not leaderboard:
            continue

        per_model: dict[str, dict[str, Any]] = {}
        for entry in leaderboard:
            name = entry["model"]
            per_model[name] = {
                "elo": entry["elo"],
                "rank": entry["rank"],
                "wins": entry["wins"],
                "losses": entry["losses"],
                "ties": entry["ties"],
                "total_battles": entry["total_battles"],
                "win_rate": entry["win_rate"],
            }
            elo_samples.setdefault(name, []).append(entry["elo"])

        data[subset] = per_model
        subset_info[subset] = {
            "total_battles": info.total_battles if info else 0,
            "model_count": len(leaderboard),
        }

    def _ordering(name: str) -> tuple[float, int, str]:
        # Descending mean ELO, then descending subset count, then name.
        elos = elo_samples[name]
        mean = sum(elos) / len(elos) if elos else 0.0
        return (-mean, -len(elos), name)

    ordered_models = sorted(elo_samples, key=_ordering)

    return {
        "subsets": subsets,
        "models": ordered_models,
        "data": data,
        "subset_info": subset_info,
    }
2161
+
2162
def get_prompts(
    self,
    subset: str,
    exp_name: str,
    page: int = 1,
    page_size: int = 10,
    min_images: Optional[int] = None,
    max_images: Optional[int] = None,
    prompt_source: Optional[str] = None,
    filter_models: Optional[list[str]] = None,
) -> tuple[list[dict[str, Any]], int]:
    """
    Get paginated list of prompts/samples with all model outputs.

    Args:
        subset: Subset name
        exp_name: Experiment name (use "__all__" for all experiments)
        page: Page number (1-indexed; values < 1 are clamped to 1)
        page_size: Number of records per page
        min_images: Minimum number of input images
        max_images: Maximum number of input images
        prompt_source: Filter by prompt source
        filter_models: Optional list of models to filter (show only these models)

    Returns:
        Tuple of (prompts_list, total_count)
    """
    # Collect unique sample indices from the battle logs
    if exp_name == "__all__":
        all_records = self._load_all_experiments_battles(subset)
    else:
        all_records = self._load_battle_logs(subset, exp_name)

    sorted_indices = sorted({record.sample_index for record in all_records})

    has_filters = (
        min_images is not None
        or max_images is not None
        or prompt_source is not None
    )
    if has_filters:
        filtered_indices = []
        for idx in sorted_indices:
            sample_meta = self._get_sample_data(subset, idx)
            img_count = sample_meta.get("input_image_count", 1)
            source = sample_meta.get("prompt_source")

            if min_images is not None and img_count < min_images:
                continue
            if max_images is not None and img_count > max_images:
                continue
            if prompt_source and source != prompt_source:
                continue

            filtered_indices.append(idx)
    else:
        # PERF: no filters requested, so skip loading per-sample metadata
        # for every index just to accept all of them.
        filtered_indices = sorted_indices

    total_count = len(filtered_indices)

    # BUGFIX: clamp page so page <= 0 no longer produces a negative slice
    # start, which silently returned items from the END of the list.
    page = max(page, 1)
    start = (page - 1) * page_size
    page_indices = filtered_indices[start:start + page_size]

    prompts = [
        self.get_sample_all_models(subset, exp_name, idx, filter_models)
        for idx in page_indices
    ]

    return prompts, total_count
2236
+
2237
+
2238
class HFArenaDataLoader(ArenaDataLoader):
    """
    Data loader for HuggingFace Spaces deployment.

    Extends ArenaDataLoader to:
    - Build an image URL index from the HF repo file listing
    - Serve HF CDN URLs for model output images instead of local paths
    """

    def __init__(
        self,
        arena_dir: str,
        data_dir: str,
        hf_repo: str,
        image_files: list[str],
        preload: bool = True,
    ):
        """
        Initialize the HF data loader.

        Args:
            arena_dir: Path to arena directory (metadata only, no images)
            data_dir: Path to data directory containing parquet files
            hf_repo: HuggingFace repo ID for image CDN URLs
            image_files: List of image file paths in the HF repo
            preload: If True, preload all data at initialization
        """
        # Build the URL index before the base class runs (it may preload).
        self.hf_repo = hf_repo
        self._image_url_index = self._build_image_index(image_files)
        super().__init__(arena_dir, data_dir, preload=preload)

    def _build_image_index(
        self, image_files: list[str]
    ) -> dict[tuple[str, str, int], str]:
        """
        Build index: (subset, model, sample_index) -> hf_file_path

        Expected path format: {subset}/models/{exp_name}/{model}/{index}.png

        Args:
            image_files: List of image file paths from HF repo

        Returns:
            Dict mapping (subset, model, sample_index) to HF file path
        """
        from genarena.models import parse_image_index

        index: dict[tuple[str, str, int], str] = {}

        for path in image_files:
            segments = path.split("/")
            # Expected layout: subset/models/exp_name/model/000000.png
            # (segments[2] is exp_name; not needed for the lookup key).
            if len(segments) < 5 or segments[1] != "models":
                continue
            sample_idx = parse_image_index(segments[4])
            if sample_idx is None:
                continue
            # Duplicate keys resolve to whichever path appears last.
            index[(segments[0], segments[3], sample_idx)] = path

        logger.info(f"Built image URL index with {len(index)} entries")
        return index

    def get_model_image_url(
        self, subset: str, model: str, sample_index: int
    ) -> Optional[str]:
        """
        Get HF CDN URL for model output image.

        Args:
            subset: Subset name
            model: Model name
            sample_index: Sample index

        Returns:
            HF CDN URL or None if not found
        """
        hf_path = self._image_url_index.get((subset, model, sample_index))
        if hf_path is None:
            return None
        return f"https://huggingface.co/datasets/{self.hf_repo}/resolve/main/{hf_path}"

    def get_image_path(
        self, subset: str, model: str, sample_index: int
    ) -> Optional[str]:
        """
        Override to return None since images are served via CDN.

        For HF deployment, use get_model_image_url() instead.
        """
        # No local file exists; callers must go through the CDN URL.
        return None
genarena/visualize/static/app.js ADDED
The diff for this file is too large to render. See raw diff
 
genarena/visualize/static/style.css ADDED
@@ -0,0 +1,4104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/* ========== CSS Variables ========== */
:root {
    /* Surface colors (darkest to lightest) */
    --bg-primary: #0d1117;
    --bg-secondary: #161b22;
    --bg-tertiary: #21262d;
    --bg-hover: #30363d;
    --border-color: #30363d;
    --border-light: #484f58;

    /* Text colors */
    --text-primary: #e6edf3;
    --text-secondary: #8b949e;
    --text-muted: #6e7681;

    /* Accent palette */
    --accent-blue: #58a6ff;
    --accent-green: #3fb950;
    --accent-red: #f85149;
    --accent-yellow: #d29922;
    --accent-purple: #a371f7;

    /* Font stacks */
    --font-mono: 'JetBrains Mono', 'Fira Code', 'Cascadia Code', monospace;
    --font-sans: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;

    /* Corner radii */
    --radius-sm: 4px;
    --radius-md: 8px;
    --radius-lg: 12px;

    /* Elevation shadows */
    --shadow-sm: 0 1px 2px rgba(0, 0, 0, 0.3);
    --shadow-md: 0 4px 12px rgba(0, 0, 0, 0.4);
    --shadow-lg: 0 8px 24px rgba(0, 0, 0, 0.5);
}

/* ========== Reset & Base ========== */
* {
    margin: 0;
    padding: 0;
    box-sizing: border-box;
}

body {
    font-family: var(--font-sans);
    background: var(--bg-primary);
    color: var(--text-primary);
    line-height: 1.6;
    min-height: 100vh;
}
/* ========== Header ========== */
.header {
    display: flex;
    align-items: center;
    justify-content: space-between;
    padding: 12px 24px;
    background: var(--bg-secondary);
    border-bottom: 1px solid var(--border-color);
    position: sticky; /* keep header visible while the content scrolls */
    top: 0;
    z-index: 100;
}

.header-left {
    display: flex;
    align-items: center;
    gap: 32px;
    flex: 0 0 auto;
}

.logo {
    font-size: 1.25rem;
    font-weight: 700;
    color: var(--accent-blue);
    letter-spacing: -0.02em;
}

/* Merged: this selector was previously declared twice (flex sizing here and
   flex layout under "Favorites Button"); one rule yields the same computed
   style. */
.header-right {
    display: flex;
    align-items: center;
    gap: 12px;
    flex: 0 0 auto;
}

.selector-group {
    display: flex;
    align-items: center;
    gap: 8px;
}

.selector-group label {
    font-size: 0.875rem;
    color: var(--text-secondary);
}

.selector {
    padding: 6px 12px;
    background: var(--bg-tertiary);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-sm);
    color: var(--text-primary);
    font-size: 0.875rem;
    cursor: pointer;
    min-width: 160px;
}

.selector:hover:not(:disabled) {
    border-color: var(--border-light);
}

.selector:disabled {
    opacity: 0.5;
    cursor: not-allowed;
}

.stats-badge {
    font-size: 0.75rem;
    padding: 4px 10px;
    background: var(--bg-tertiary);
    border-radius: var(--radius-sm);
    color: var(--text-secondary);
    font-family: var(--font-mono);
}

/* Favorites Button */
.btn-favorites {
    display: flex;
    align-items: center;
    gap: 6px;
    padding: 6px 12px;
    background: var(--bg-tertiary);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-sm);
    color: var(--accent-yellow);
    cursor: pointer;
    transition: all 0.15s ease;
}

.btn-favorites:hover {
    background: var(--bg-hover);
    border-color: var(--accent-yellow);
}

.favorites-icon {
    font-size: 1rem;
}

.favorites-count {
    font-size: 0.75rem;
    font-family: var(--font-mono);
    background: var(--bg-primary);
    padding: 2px 6px;
    border-radius: var(--radius-sm);
    color: var(--text-secondary);
}

/* View Toggle */
.view-toggle {
    display: flex;
    gap: 4px;
    background: var(--bg-tertiary);
    padding: 4px;
    border-radius: var(--radius-sm);
    border: 1px solid var(--border-color);
}

.view-btn {
    display: flex;
    align-items: center;
    gap: 6px;
    padding: 6px 12px;
    background: transparent;
    border: none;
    border-radius: var(--radius-sm);
    color: var(--text-secondary);
    font-size: 0.8125rem;
    cursor: pointer;
    transition: all 0.15s ease;
}

.view-btn:hover {
    color: var(--text-primary);
    background: var(--bg-hover);
}

.view-btn.active {
    background: var(--accent-blue);
    color: #fff;
}

.view-icon {
    font-size: 0.875rem;
}
/* ========== Main Layout ========== */
.main-container {
    display: flex;
    min-height: calc(100vh - 57px); /* viewport minus sticky header height */
}

/* ========== Sidebar ========== */
.sidebar {
    width: 280px;
    flex-shrink: 0;
    background: var(--bg-secondary);
    border-right: 1px solid var(--border-color);
    padding: 20px;
    display: flex;
    flex-direction: column;
    gap: 24px;
}

.filter-section h3,
.stats-section h3 {
    font-size: 0.75rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    color: var(--text-muted);
    margin-bottom: 12px;
}

.filter-group {
    margin-bottom: 12px;
}

.filter-group label {
    display: block;
    font-size: 0.8125rem;
    color: var(--text-secondary);
    margin-bottom: 4px;
}

.filter-select {
    width: 100%;
    padding: 8px 10px;
    background: var(--bg-tertiary);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-sm);
    color: var(--text-primary);
    font-size: 0.875rem;
}

/* Checkbox group for multi-select models */
.checkbox-group {
    max-height: 200px;
    overflow-y: auto;
    background: var(--bg-tertiary);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-sm);
    padding: 8px;
    margin-bottom: 8px;
}

.checkbox-item {
    display: flex;
    align-items: center;
    padding: 4px 0;
    cursor: pointer;
}

/* Negative margin lets the hover highlight bleed to the group's edges. */
.checkbox-item:hover {
    background: var(--bg-hover);
    margin: 0 -8px;
    padding: 4px 8px;
}

.checkbox-item input[type="checkbox"] {
    margin-right: 8px;
    accent-color: var(--accent-blue);
    cursor: pointer;
}

.checkbox-item label {
    font-size: 0.8125rem;
    color: var(--text-primary);
    cursor: pointer;
    flex: 1;
    white-space: nowrap;
    overflow: hidden;
    text-overflow: ellipsis;
}

.checkbox-actions {
    display: flex;
    gap: 4px;
}

.checkbox-actions .btn-small {
    flex: 1;
    font-size: 0.75rem;
}

/* Consolidated: these three id rules previously repeated identical
   declarations in two separate rulesets. */
#model-count,
#prompts-model-count,
#favorites-model-count {
    font-size: 0.75rem;
    color: var(--text-muted);
}

.filter-hint {
    font-size: 0.75rem;
    color: var(--text-muted);
    margin-top: 8px;
    font-style: italic;
}

/* Range Slider for image count filter */
.range-slider-container {
    position: relative;
    height: 30px;
    margin: 8px 0;
}

/* Two overlaid native sliders form a dual-handle range control; the track
   itself ignores pointer events so only the thumbs are draggable. */
.range-slider {
    position: absolute;
    width: 100%;
    height: 4px;
    background: transparent;
    -webkit-appearance: none;
    appearance: none;
    pointer-events: none;
    top: 50%;
    transform: translateY(-50%);
}

.range-slider::-webkit-slider-runnable-track {
    height: 4px;
    background: var(--bg-tertiary);
    border-radius: 2px;
}

.range-slider::-webkit-slider-thumb {
    -webkit-appearance: none;
    appearance: none;
    width: 16px;
    height: 16px;
    background: var(--accent-blue);
    border-radius: 50%;
    cursor: pointer;
    pointer-events: auto;
    margin-top: -6px; /* center the 16px thumb on the 4px track */
}

.range-slider::-moz-range-track {
    height: 4px;
    background: var(--bg-tertiary);
    border-radius: 2px;
}

.range-slider::-moz-range-thumb {
    width: 16px;
    height: 16px;
    background: var(--accent-blue);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    pointer-events: auto;
}

.range-labels {
    display: flex;
    justify-content: space-between;
    font-size: 0.75rem;
    color: var(--text-muted);
}

#image-range-display {
    font-family: var(--font-mono);
    color: var(--accent-blue);
}
/* ========== Buttons ========== */
.btn {
    padding: 8px 16px;
    border: none;
    border-radius: var(--radius-sm);
    font-size: 0.875rem;
    font-weight: 500;
    cursor: pointer;
    transition: all 0.15s ease;
}

.btn-primary {
    background: var(--accent-blue);
    color: #fff;
}

.btn-primary:hover {
    background: #4c9aed;
}

.btn-secondary {
    background: var(--bg-tertiary);
    color: var(--text-secondary);
    border: 1px solid var(--border-color);
}

.btn-secondary:hover {
    background: var(--bg-hover);
    color: var(--text-primary);
}

.btn-small {
    padding: 4px 10px;
    font-size: 0.8125rem;
}

.btn:disabled {
    opacity: 0.5;
    cursor: not-allowed;
}

/* Sidebar buttons stretch to the full filter-section width. */
.filter-section .btn {
    width: 100%;
    margin-top: 8px;
}

.filter-section .btn-secondary {
    margin-top: 4px;
}

/* ========== Stats Panel ========== */
#stats-panel {
    font-size: 0.875rem;
    color: var(--text-secondary);
}

#stats-panel .stat-item {
    display: flex;
    justify-content: space-between;
    padding: 6px 0;
    border-bottom: 1px solid var(--border-color);
}

#stats-panel .stat-item:last-child {
    border-bottom: none;
}

#stats-panel .stat-label {
    color: var(--text-muted);
}

#stats-panel .stat-value {
    color: var(--text-primary);
    font-family: var(--font-mono);
}

.placeholder {
    color: var(--text-muted);
    font-style: italic;
}
/* Head-to-Head Section */
.h2h-section {
    margin-top: 16px;
    padding-top: 16px;
    border-top: 1px solid var(--border-color);
}

.h2h-section h3 {
    font-size: 0.75rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    color: var(--text-muted);
    margin-bottom: 12px;
}

/* ELO Leaderboard Section */
.elo-section {
    margin-top: 16px;
    padding-top: 16px;
    border-top: 1px solid var(--border-color);
}

.elo-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 12px;
}

.elo-header h3 {
    font-size: 0.75rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    color: var(--text-muted);
    margin: 0;
}

.btn-link {
    background: transparent;
    border: none;
    color: var(--accent-blue);
    font-size: 0.75rem;
    cursor: pointer;
    padding: 2px 4px;
}

.btn-link:hover {
    text-decoration: underline;
}

#elo-panel {
    /* No height limit - show all models */
}

/* ELO Bar Chart Item */
.elo-item {
    display: flex;
    align-items: center;
    gap: 8px;
    margin-bottom: 8px;
    cursor: pointer;
    padding: 4px;
    border-radius: var(--radius-sm);
    transition: background 0.15s ease;
}

.elo-item:hover {
    background: var(--bg-hover);
}

.elo-rank {
    font-size: 0.75rem;
    font-weight: 600;
    color: var(--text-muted);
    min-width: 24px;
    text-align: center;
}

/* Medal colors: gold, silver, bronze */
.elo-rank.rank-1 {
    color: var(--accent-yellow);
}

.elo-rank.rank-2 {
    color: #c0c0c0;
}

.elo-rank.rank-3 {
    color: #cd7f32;
}

.elo-model-name {
    font-size: 0.75rem;
    color: var(--text-secondary);
    flex-shrink: 0;
    width: 80px;
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
}

.elo-bar-container {
    flex: 1;
    height: 16px;
    background: var(--bg-tertiary);
    border-radius: 2px;
    overflow: hidden;
    position: relative;
}

.elo-bar {
    height: 100%;
    background: linear-gradient(90deg, var(--accent-blue), var(--accent-purple));
    border-radius: 2px;
    transition: width 0.3s ease;
}

.elo-value {
    font-size: 0.6875rem;
    font-family: var(--font-mono);
    color: var(--text-primary);
    min-width: 36px;
    text-align: right;
}
/* Leaderboard Modal */
.leaderboard-modal-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    padding: 20px;
    border-bottom: 1px solid var(--border-color);
}

.leaderboard-modal-header h2 {
    font-size: 1.25rem;
    color: var(--text-primary);
    margin: 0;
}

.subset-badge {
    font-size: 0.875rem;
    padding: 4px 12px;
    background: var(--bg-tertiary);
    border-radius: var(--radius-sm);
    color: var(--accent-blue);
}

#leaderboard-content {
    padding: 20px;
    max-height: calc(90vh - 80px); /* modal height minus its header */
    overflow-y: auto;
}

/* Leaderboard Table */
.leaderboard-table {
    width: 100%;
    border-collapse: collapse;
}

.leaderboard-table th,
.leaderboard-table td {
    padding: 12px 16px;
    text-align: left;
    border-bottom: 1px solid var(--border-color);
}

.leaderboard-table th {
    background: var(--bg-tertiary);
    color: var(--text-muted);
    font-size: 0.75rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    font-weight: 600;
    position: sticky; /* header row stays pinned while the table scrolls */
    top: 0;
}

.leaderboard-table td {
    font-size: 0.875rem;
    color: var(--text-primary);
}

.leaderboard-table tbody tr {
    cursor: pointer;
    transition: background 0.15s ease;
}

.leaderboard-table tbody tr:hover {
    background: var(--bg-hover);
}

.leaderboard-table .rank-cell {
    font-weight: 600;
    text-align: center;
    width: 60px;
}

/* Medal colors: gold, silver, bronze */
.leaderboard-table .rank-cell.rank-1 {
    color: var(--accent-yellow);
}

.leaderboard-table .rank-cell.rank-2 {
    color: #c0c0c0;
}

.leaderboard-table .rank-cell.rank-3 {
    color: #cd7f32;
}

.leaderboard-table .model-cell {
    font-family: var(--font-mono);
}

.leaderboard-table .elo-cell {
    font-family: var(--font-mono);
    font-weight: 600;
    color: var(--accent-blue);
}

.leaderboard-table .stat-cell {
    font-family: var(--font-mono);
    color: var(--text-secondary);
}

.leaderboard-table .win-rate-cell {
    width: 120px;
}

.win-rate-bar {
    display: flex;
    align-items: center;
    gap: 8px;
}

.win-rate-bar-bg {
    flex: 1;
    height: 6px;
    background: var(--bg-primary);
    border-radius: 3px;
    overflow: hidden;
}

.win-rate-bar-fill {
    height: 100%;
    background: var(--accent-green);
    border-radius: 3px;
}

.win-rate-text {
    font-family: var(--font-mono);
    font-size: 0.75rem;
    color: var(--accent-green);
    min-width: 40px;
    text-align: right;
}
/* Model Stats Modal */
#model-stats-content {
    padding: 20px;
}

.model-stats-header {
    margin-bottom: 24px;
    padding-bottom: 16px;
    border-bottom: 1px solid var(--border-color);
}

.model-stats-header h2 {
    font-size: 1.25rem;
    margin: 0 0 8px 0;
    font-family: var(--font-mono);
}

.model-stats-summary {
    display: flex;
    flex-wrap: wrap;
    gap: 16px;
}

.model-stat-item {
    background: var(--bg-tertiary);
    padding: 12px 16px;
    border-radius: var(--radius-sm);
    text-align: center;
    min-width: 100px;
}

.model-stat-item .stat-label {
    font-size: 0.6875rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    color: var(--text-muted);
    margin-bottom: 4px;
}

.model-stat-item .stat-value {
    font-size: 1.25rem;
    font-weight: 600;
    font-family: var(--font-mono);
    text-align: center;
}

/* Per-metric accent colors */
.model-stat-item .stat-value.elo-value {
    color: var(--accent-blue);
}

.model-stat-item .stat-value.wins-value {
    color: var(--accent-green);
}

.model-stat-item .stat-value.losses-value {
    color: var(--accent-red);
}

.model-stat-item .stat-value.ties-value {
    color: var(--accent-yellow);
}

.vs-stats-section h3 {
    font-size: 0.875rem;
    color: var(--text-muted);
    margin-bottom: 16px;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}

.vs-stats-table {
    width: 100%;
    border-collapse: collapse;
}

.vs-stats-table th,
.vs-stats-table td {
    padding: 10px 12px;
    text-align: left;
    border-bottom: 1px solid var(--border-color);
}

.vs-stats-table th {
    background: var(--bg-tertiary);
    color: var(--text-muted);
    font-size: 0.75rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    font-weight: 600;
}

.vs-stats-table td {
    font-size: 0.875rem;
    color: var(--text-primary);
}

.vs-stats-table tbody tr:hover {
    background: var(--bg-hover);
}

.vs-stats-table .opponent-cell {
    font-family: var(--font-mono);
}

.vs-stats-table .opponent-elo {
    font-size: 0.75rem;
    color: var(--text-muted);
    margin-left: 8px;
}

.vs-stats-table .wlt-cell {
    font-family: var(--font-mono);
}

/* Win/loss/tie counts use the shared accent palette */
.vs-stats-table .wlt-cell .wins {
    color: var(--accent-green);
}

.vs-stats-table .wlt-cell .losses {
    color: var(--accent-red);
}

.vs-stats-table .wlt-cell .ties {
    color: var(--accent-yellow);
}
/* Head-to-head stacked win/tie/loss bar */
.h2h-bar {
    display: flex;
    height: 24px;
    border-radius: var(--radius-sm);
    overflow: hidden;
    margin: 8px 0;
}

/* Segment for model A's wins */
.h2h-bar-a {
    background: var(--accent-green);
    display: flex;
    align-items: center;
    justify-content: center;
    font-size: 0.75rem;
    font-weight: 600;
    color: #000;
    min-width: 30px;
}

/* Segment for ties */
.h2h-bar-tie {
    background: var(--accent-yellow);
    display: flex;
    align-items: center;
    justify-content: center;
    font-size: 0.75rem;
    font-weight: 600;
    color: #000;
    min-width: 20px;
}

/* Segment for model B's wins */
.h2h-bar-b {
    background: var(--accent-red);
    display: flex;
    align-items: center;
    justify-content: center;
    font-size: 0.75rem;
    font-weight: 600;
    color: #fff;
    min-width: 30px;
}

.h2h-labels {
    display: flex;
    justify-content: space-between;
    font-size: 0.75rem;
    color: var(--text-secondary);
    margin-bottom: 4px;
}

.h2h-label {
    max-width: 45%;
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
}

.h2h-stats-row {
    display: flex;
    justify-content: space-between;
    font-size: 0.8125rem;
    padding: 4px 0;
}

.h2h-stats-row .value {
    font-family: var(--font-mono);
    color: var(--text-primary);
}
/* ========== Content Area ========== */
.content {
    flex: 1;
    padding: 20px;
    overflow-y: auto;
}

.content-header,
.content-footer {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 16px;
}

.content-footer {
    margin-top: 16px;
    margin-bottom: 0;
    justify-content: flex-end;
}

.pagination-info {
    font-size: 0.875rem;
    color: var(--text-secondary);
}

.pagination-controls {
    display: flex;
    align-items: center;
    gap: 4px;
}

.page-numbers {
    display: flex;
    align-items: center;
    gap: 2px;
}

.page-number {
    min-width: 32px;
    height: 28px;
    padding: 0 8px;
    border: none;
    border-radius: var(--radius-sm);
    background: var(--bg-tertiary);
    color: var(--text-secondary);
    font-size: 0.8125rem;
    cursor: pointer;
    transition: all 0.15s ease;
}

.page-number:hover {
    background: var(--bg-hover);
    color: var(--text-primary);
}

.page-number.active {
    background: var(--accent-blue);
    color: #fff;
}

/* "..." gap markers are rendered as non-interactive page buttons. */
.page-number.ellipsis {
    background: transparent;
    cursor: default;
    color: var(--text-muted);
}

.page-number.ellipsis:hover {
    background: transparent;
    color: var(--text-muted);
}

.page-jump {
    display: flex;
    align-items: center;
    gap: 4px;
    margin-left: 8px;
}

.page-input {
    width: 60px;
    height: 28px;
    padding: 0 8px;
    border: 1px solid var(--border-color);
    border-radius: var(--radius-sm);
    background: var(--bg-tertiary);
    color: var(--text-primary);
    font-size: 0.8125rem;
    text-align: center;
}

.page-input:focus {
    outline: none;
    border-color: var(--accent-blue);
}

/* Hide the native number-input spinner buttons (WebKit). */
.page-input::-webkit-outer-spin-button,
.page-input::-webkit-inner-spin-button {
    -webkit-appearance: none;
    margin: 0;
}

/* Hide the spinner in Firefox. The standard `appearance: textfield` is
   added alongside the deprecated -moz- prefixed form for modern browsers. */
.page-input[type=number] {
    -moz-appearance: textfield;
    appearance: textfield;
}
/* ========== Battle List ========== */
.battle-list {
    display: flex;
    flex-direction: column;
    gap: 12px;
}

/* Shown when no battles match the active filters */
.empty-state {
    text-align: center;
    padding: 60px 20px;
    color: var(--text-muted);
}
/* ========== Battle Card ========== */
.battle-card {
    background: var(--bg-secondary);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-md);
    padding: 16px;
    cursor: pointer;
    transition: all 0.15s ease;
}

.battle-card:hover {
    border-color: var(--border-light);
    background: var(--bg-tertiary);
}

.battle-card-header {
    display: flex;
    justify-content: space-between;
    align-items: flex-start;
    margin-bottom: 12px;
}

.battle-models {
    display: flex;
    align-items: center;
    gap: 8px;
    flex-wrap: wrap;
}

.model-name {
    font-family: var(--font-mono);
    font-size: 0.875rem;
    padding: 2px 8px;
    background: var(--bg-tertiary);
    border-radius: var(--radius-sm);
}

.model-name.winner {
    background: rgba(63, 185, 80, 0.2);
    color: var(--accent-green);
}

.model-name.loser {
    background: rgba(248, 81, 73, 0.15);
    color: var(--accent-red);
}

.vs-label {
    color: var(--text-muted);
    font-size: 0.75rem;
}

.battle-badges {
    display: flex;
    gap: 6px;
}

.badge {
    display: inline-flex;
    align-items: center;
    justify-content: center;
    font-size: 0.6875rem;
    padding: 2px 6px;
    border-radius: var(--radius-sm);
    text-transform: uppercase;
    font-weight: 600;
    letter-spacing: 0.02em;
    line-height: 1;
}

.badge-win {
    background: rgba(63, 185, 80, 0.2);
    color: var(--accent-green);
}

.badge-loss {
    background: rgba(248, 81, 73, 0.15);
    color: var(--accent-red);
}

.badge-tie {
    background: rgba(210, 153, 34, 0.2);
    color: var(--accent-yellow);
}

.badge-consistent {
    background: rgba(88, 166, 255, 0.15);
    color: var(--accent-blue);
}

.badge-inconsistent {
    background: rgba(163, 113, 247, 0.15);
    color: var(--accent-purple);
}

/* Instruction preview, clamped to two lines */
.battle-instruction {
    font-size: 0.875rem;
    color: var(--text-secondary);
    margin-bottom: 12px;
    display: -webkit-box;
    -webkit-line-clamp: 2;
    -webkit-box-orient: vertical;
    overflow: hidden;
}

.battle-images {
    display: grid;
    grid-template-columns: repeat(3, 1fr);
    gap: 8px;
}

.battle-image-container {
    position: relative;
    aspect-ratio: 1;
    background: var(--bg-tertiary);
    border-radius: var(--radius-sm);
    overflow: hidden;
}

.battle-image-container img {
    width: 100%;
    height: 100%;
    object-fit: cover;
}

/* Multiple input images grid */
.battle-image-container.multi-input {
    display: flex;
    flex-direction: column;
}

.input-thumbs-grid {
    flex: 1;
    display: grid;
    gap: 2px;
    padding: 2px;
    min-height: 0; /* Prevent grid from growing based on content */
}

/* Dynamic grid based on image count */
.battle-image-container.multi-input[data-count="2"] .input-thumbs-grid {
    grid-template-columns: repeat(2, 1fr);
}

.battle-image-container.multi-input[data-count="3"] .input-thumbs-grid,
.battle-image-container.multi-input[data-count="4"] .input-thumbs-grid {
    grid-template-columns: 1fr 1fr;
    grid-template-rows: 1fr 1fr;
}

.battle-image-container.multi-input[data-count="3"] .input-thumb,
.battle-image-container.multi-input[data-count="4"] .input-thumb {
    aspect-ratio: 1;
    min-height: 0;
    min-width: 0;
}

.battle-image-container.multi-input[data-count="5"] .input-thumbs-grid,
.battle-image-container.multi-input[data-count="6"] .input-thumbs-grid {
    grid-template-columns: repeat(3, 1fr);
    grid-template-rows: repeat(2, 1fr);
}

.battle-image-container.multi-input[data-count="7"] .input-thumbs-grid,
.battle-image-container.multi-input[data-count="8"] .input-thumbs-grid,
.battle-image-container.multi-input[data-count="9"] .input-thumbs-grid {
    grid-template-columns: repeat(3, 1fr);
    grid-template-rows: repeat(3, 1fr);
}

/* Fallback for 10+ images; the [data-count] rules above win on specificity
   for the explicit counts. */
.battle-image-container.multi-input .input-thumbs-grid {
    grid-template-columns: repeat(auto-fit, minmax(40px, 1fr));
}

.input-thumb {
    position: relative;
    overflow: hidden;
    border-radius: 2px;
    background: var(--bg-primary);
    aspect-ratio: 1;
}

.input-thumb img {
    width: 100%;
    height: 100%;
    object-fit: cover;
    object-position: center;
}

.image-label {
    position: absolute;
    bottom: 4px;
    left: 4px;
    font-size: 0.625rem;
    padding: 2px 4px;
    background: rgba(0, 0, 0, 0.7);
    border-radius: 2px;
    color: #fff;
    z-index: 1;
}

.battle-meta {
    display: flex;
    flex-wrap: wrap;
    gap: 8px;
    justify-content: space-between;
    margin-top: 8px;
    font-size: 0.75rem;
    color: var(--text-muted);
}

.battle-meta span {
    white-space: nowrap;
}
1237
+
1238
/* ========== Modal ========== */

/* Full-viewport overlay that centers its content */
.modal {
  position: fixed;
  top: 0;
  left: 0;
  right: 0;
  bottom: 0;
  z-index: 1000;
  display: flex;
  align-items: center;
  justify-content: center;
}

.modal.hidden {
  display: none;
}

.modal-backdrop {
  position: absolute;
  top: 0;
  left: 0;
  right: 0;
  bottom: 0;
  background: rgba(0, 0, 0, 0.7);
}

.modal-content {
  position: relative;
  background: var(--bg-secondary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-lg);
  max-width: 1200px;
  max-height: 90vh;
  width: 95%;
  overflow-y: auto;
  box-shadow: var(--shadow-lg);
}

/* Variant that stretches to nearly the full viewport width */
.modal-content-wide {
  max-width: 95vw;
  width: 95vw;
}

.modal-close {
  position: absolute;
  top: 12px;
  right: 12px;
  width: 32px;
  height: 32px;
  border: none;
  background: var(--bg-tertiary);
  border-radius: var(--radius-sm);
  color: var(--text-secondary);
  font-size: 1.25rem;
  cursor: pointer;
  z-index: 10;
}

.modal-close:hover {
  background: var(--bg-hover);
  color: var(--text-primary);
}
/* ========== Detail View ========== */

.detail-header {
  padding: 20px;
  border-bottom: 1px solid var(--border-color);
}

.detail-header h2 {
  font-size: 1.125rem;
  margin-bottom: 8px;
}

.detail-meta-info {
  display: flex;
  flex-wrap: wrap;
  gap: 12px;
  margin-top: 8px;
  margin-bottom: 8px;
}

.meta-tag {
  font-size: 0.8125rem;
  color: var(--text-secondary);
  background: var(--bg-tertiary);
  padding: 4px 10px;
  border-radius: var(--radius-sm);
}

.meta-tag strong {
  color: var(--text-muted);
  font-weight: 500;
}

.detail-instruction {
  font-size: 0.9375rem;
  color: var(--text-secondary);
  background: var(--bg-tertiary);
  padding: 12px;
  border-radius: var(--radius-sm);
  margin-top: 12px;
}

.detail-metadata-section {
  margin-top: 16px;
}

.detail-metadata-section h4 {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.05em;
  color: var(--text-muted);
  margin-bottom: 8px;
}

/* Raw JSON dump shown in a scrollable, monospace box */
.metadata-json {
  font-family: var(--font-mono);
  font-size: 0.75rem;
  line-height: 1.5;
  white-space: pre-wrap;
  word-break: break-word;
  color: var(--text-secondary);
  background: var(--bg-primary);
  padding: 12px;
  border-radius: var(--radius-sm);
  max-height: 200px;
  overflow-y: auto;
  border: 1px solid var(--border-color);
}

.detail-images {
  display: grid;
  grid-template-columns: repeat(3, 1fr);
  gap: 16px;
  padding: 20px;
  border-bottom: 1px solid var(--border-color);
}

.detail-image-box {
  text-align: center;
}

.detail-image-box.input-image {
  background: var(--bg-tertiary);
  border-radius: var(--radius-md);
  padding: 12px;
}

.detail-image-box.output-image {
  background: var(--bg-tertiary);
  border: 2px solid var(--border-light);
  border-radius: var(--radius-md);
  padding: 12px;
  position: relative;
}

/* Winner/loser accents on the output boxes */
.detail-image-box.output-image.winner {
  border-color: var(--accent-green);
  box-shadow: 0 0 12px rgba(63, 185, 80, 0.2);
}

.detail-image-box.output-image.loser {
  border-color: var(--accent-red);
  opacity: 0.85;
}

.detail-image-box h4 {
  font-size: 0.8125rem;
  color: var(--text-secondary);
  margin-bottom: 8px;
}

.detail-image-box.output-image.winner h4 {
  color: var(--accent-green);
}

.detail-image-box.output-image.loser h4 {
  color: var(--accent-red);
}

.detail-image-box img {
  width: 100%;
  max-height: 400px;
  object-fit: contain;
  background: var(--bg-primary);
  border-radius: var(--radius-sm);
  cursor: zoom-in;
  /* NOTE(review): transition targets `transform` but the :hover rule
     below changes `opacity`, so the fade is not animated — confirm intent */
  transition: transform 0.2s ease;
}

.detail-image-box img:hover {
  opacity: 0.9;
}
/* ===== Image Lightbox (full-screen zoom overlay) ===== */

.lightbox {
  position: fixed;
  top: 0;
  left: 0;
  right: 0;
  bottom: 0;
  z-index: 2000;
  background: rgba(0, 0, 0, 0.95);
  display: flex;
  align-items: center;
  justify-content: center;
  cursor: zoom-out;
  opacity: 0;
  visibility: hidden;
  transition: opacity 0.2s ease, visibility 0.2s ease;
}

.lightbox.active {
  opacity: 1;
  visibility: visible;
}

.lightbox img {
  max-width: 95vw;
  max-height: 95vh;
  object-fit: contain;
  border-radius: var(--radius-md);
  box-shadow: var(--shadow-lg);
}

.lightbox-close {
  position: absolute;
  top: 20px;
  right: 20px;
  width: 40px;
  height: 40px;
  border: none;
  background: var(--bg-tertiary);
  border-radius: 50%;
  color: var(--text-primary);
  font-size: 1.5rem;
  cursor: pointer;
  display: flex;
  align-items: center;
  justify-content: center;
}

.lightbox-close:hover {
  background: var(--bg-hover);
}

.lightbox-label {
  position: absolute;
  bottom: 20px;
  left: 50%;
  transform: translateX(-50%);
  background: var(--bg-tertiary);
  padding: 8px 16px;
  border-radius: var(--radius-sm);
  color: var(--text-primary);
  font-size: 0.875rem;
}

/* ===== VLM judge output panels ===== */

.detail-vlm-outputs {
  padding: 20px;
}

.detail-vlm-outputs h3 {
  font-size: 0.875rem;
  color: var(--text-muted);
  margin-bottom: 12px;
  text-transform: uppercase;
  letter-spacing: 0.05em;
}

.vlm-call {
  background: var(--bg-tertiary);
  border-radius: var(--radius-md);
  padding: 16px;
  margin-bottom: 16px;
}

.vlm-call h4 {
  font-size: 0.8125rem;
  color: var(--accent-blue);
  margin-bottom: 8px;
}

.vlm-call-meta {
  font-size: 0.75rem;
  color: var(--text-muted);
  margin-bottom: 12px;
}

.vlm-response {
  font-family: var(--font-mono);
  font-size: 0.8125rem;
  line-height: 1.5;
  white-space: pre-wrap;
  word-break: break-word;
  color: var(--text-secondary);
  background: var(--bg-primary);
  padding: 12px;
  border-radius: var(--radius-sm);
  max-height: 300px;
  overflow-y: auto;
}

/* ========== Loading State ========== */

.loading {
  text-align: center;
  padding: 40px;
  color: var(--text-muted);
}

/* Inline spinner appended after the loading text */
.loading::after {
  content: '';
  display: inline-block;
  width: 20px;
  height: 20px;
  border: 2px solid var(--border-color);
  border-top-color: var(--accent-blue);
  border-radius: 50%;
  animation: spin 0.8s linear infinite;
  margin-left: 8px;
  vertical-align: middle;
}

@keyframes spin {
  to { transform: rotate(360deg); }
}
/* ========== Favorites Modal ========== */

.favorites-modal-header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 20px;
  border-bottom: 1px solid var(--border-color);
}

.favorites-modal-header h2 {
  font-size: 1.25rem;
  color: var(--text-primary);
}

/* Scrollable body below the fixed modal header */
.favorites-scrollable {
  max-height: calc(90vh - 80px);
  overflow-y: auto;
  padding: 20px;
}

/* Model filter panel, laid out horizontally */
.favorites-model-filter {
  padding: 16px;
  background: var(--bg-secondary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-md);
  margin-bottom: 20px;
}

.favorites-model-filter label {
  display: block;
  font-size: 0.875rem;
  font-weight: 500;
  margin-bottom: 12px;
  color: var(--text-primary);
}

/* Checkboxes rendered as pill-style toggle chips */
.checkbox-group-horizontal {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  margin-bottom: 12px;
}

.checkbox-group-horizontal .checkbox-item {
  display: inline-flex;
  align-items: center;
  padding: 6px 12px;
  background: var(--bg-tertiary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-sm);
  cursor: pointer;
  transition: all 0.15s ease;
  white-space: nowrap;
}

.checkbox-group-horizontal .checkbox-item:hover {
  background: var(--bg-hover);
  border-color: var(--accent-blue);
}

.checkbox-group-horizontal .checkbox-item.selected {
  background: var(--accent-blue);
  border-color: var(--accent-blue);
  color: white;
}

.checkbox-group-horizontal .checkbox-item.selected .checkbox-label {
  color: white;
}

/* Visually hide the native checkbox; the chip conveys selection state */
.checkbox-group-horizontal .checkbox-item input[type="checkbox"] {
  position: absolute;
  opacity: 0;
  width: 0;
  height: 0;
}

.checkbox-group-horizontal .checkbox-item .checkbox-label {
  font-size: 0.8125rem;
  color: var(--text-primary);
  cursor: pointer;
  margin: 0;
}

.checkbox-actions-inline {
  display: flex;
  gap: 8px;
  flex-wrap: wrap;
}

.checkbox-actions-inline .btn-small {
  font-size: 0.75rem;
}

/* Buttons and the stats-scope toggle share one row */
.filter-controls-row {
  display: flex;
  justify-content: space-between;
  align-items: center;
  flex-wrap: wrap;
  gap: 12px;
}

.stats-scope-toggle {
  display: flex;
  align-items: center;
}

.stats-scope-toggle .toggle-label {
  display: flex;
  align-items: center;
  cursor: pointer;
  font-size: 0.8125rem;
  color: var(--text-secondary);
  gap: 6px;
}

.stats-scope-toggle .toggle-label input[type="checkbox"] {
  width: 16px;
  height: 16px;
  accent-color: var(--accent-blue);
  cursor: pointer;
}

.stats-scope-toggle .toggle-text {
  user-select: none;
}

.stats-scope-toggle .toggle-label:hover .toggle-text {
  color: var(--text-primary);
}

#favorites-content {
  /* padding intentionally omitted — handled by .favorites-scrollable */
}

.favorites-empty {
  text-align: center;
  padding: 60px 20px;
  color: var(--text-muted);
}
/* ===== Favorite prompt card ===== */

.favorite-prompt-card {
  background: var(--bg-tertiary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-md);
  margin-bottom: 24px;
  overflow: hidden;
}

.favorite-prompt-header {
  display: flex;
  justify-content: space-between;
  align-items: flex-start;
  padding: 16px;
  border-bottom: 1px solid var(--border-color);
  background: var(--bg-secondary);
}

.favorite-prompt-info {
  flex: 1;
}

.favorite-prompt-instruction {
  font-size: 0.9375rem;
  color: var(--text-primary);
  margin-bottom: 8px;
  line-height: 1.5;
}

.favorite-prompt-meta {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  font-size: 0.75rem;
  color: var(--text-muted);
}

.favorite-prompt-meta span {
  background: var(--bg-primary);
  padding: 2px 8px;
  border-radius: var(--radius-sm);
}

.favorite-prompt-actions {
  display: flex;
  gap: 8px;
}

.btn-unfavorite {
  background: transparent;
  border: 1px solid var(--accent-red);
  color: var(--accent-red);
  padding: 4px 12px;
  border-radius: var(--radius-sm);
  font-size: 0.75rem;
  cursor: pointer;
  transition: all 0.15s ease;
}

.btn-unfavorite:hover {
  background: rgba(248, 81, 73, 0.15);
}

/* ===== Favorite input images ===== */

.favorite-input-section {
  padding: 12px 16px;
  background: var(--bg-primary);
  border-bottom: 1px solid var(--border-color);
}

.favorite-input-title {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.05em;
  color: var(--text-muted);
  margin-bottom: 8px;
}

.favorite-input-images {
  display: flex;
  gap: 8px;
  flex-wrap: wrap;
}

.favorite-input-image {
  width: 80px;
  height: 80px;
  border-radius: var(--radius-sm);
  overflow: hidden;
  cursor: zoom-in;
  border: 1px solid var(--border-color);
}

.favorite-input-image img {
  width: 100%;
  height: 100%;
  object-fit: cover;
}

/* ===== Favorite models grid ===== */

.favorite-models-section {
  padding: 16px;
}

.favorite-models-title {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.05em;
  color: var(--text-muted);
  margin-bottom: 12px;
}

.favorite-models-grid {
  display: grid;
  grid-template-columns: repeat(auto-fill, minmax(180px, 1fr));
  gap: 12px;
}

.favorite-model-card {
  background: var(--bg-secondary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-sm);
  overflow: hidden;
  transition: all 0.15s ease;
}

.favorite-model-card:hover {
  border-color: var(--border-light);
}

/* Podium accents: gold / silver / bronze */
.favorite-model-card.rank-1 {
  border-color: var(--accent-yellow);
  box-shadow: 0 0 8px rgba(210, 153, 34, 0.2);
}

.favorite-model-card.rank-2 {
  border-color: var(--text-muted);
}

.favorite-model-card.rank-3 {
  border-color: #cd7f32;
}

.favorite-model-image {
  aspect-ratio: 1;
  width: 100%;
  overflow: hidden;
  cursor: zoom-in;
  background: var(--bg-primary);
}

.favorite-model-image img {
  width: 100%;
  height: 100%;
  object-fit: cover;
  transition: transform 0.2s ease;
}

.favorite-model-image:hover img {
  transform: scale(1.05);
}

.favorite-model-info {
  padding: 8px;
  border-top: 1px solid var(--border-color);
}

.favorite-model-name {
  font-family: var(--font-mono);
  font-size: 0.75rem;
  color: var(--text-primary);
  margin-bottom: 4px;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
}

.favorite-model-rank {
  display: inline-block;
  margin-right: 4px;
  font-size: 0.6875rem;
}

.favorite-model-rank.rank-1 {
  color: var(--accent-yellow);
}

.favorite-model-rank.rank-2 {
  color: var(--text-muted);
}

.favorite-model-rank.rank-3 {
  color: #cd7f32;
}

.favorite-model-stats {
  font-size: 0.6875rem;
  color: var(--text-muted);
}

.favorite-model-stats .win-rate {
  color: var(--accent-green);
  font-weight: 600;
}

.favorite-model-stats .wins {
  color: var(--accent-green);
}

.favorite-model-stats .losses {
  color: var(--accent-red);
}

.favorite-model-stats .ties {
  color: var(--accent-yellow);
}

/* ===== Battle card favorite button ===== */

.battle-card-header {
  display: flex;
  justify-content: space-between;
  align-items: flex-start;
  margin-bottom: 12px;
}

.btn-favorite-toggle {
  background: transparent;
  border: none;
  font-size: 1.25rem;
  cursor: pointer;
  color: var(--text-muted);
  padding: 4px;
  line-height: 1;
  transition: all 0.15s ease;
}

.btn-favorite-toggle:hover {
  color: var(--accent-yellow);
  transform: scale(1.1);
}

.btn-favorite-toggle.favorited {
  color: var(--accent-yellow);
}

/* ===== Favorites loading state ===== */

.favorite-loading {
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  padding: 40px;
  color: var(--text-muted);
}

/* Spinner below the loading text; reuses the `spin` keyframes */
.favorite-loading::after {
  content: '';
  display: block;
  width: 30px;
  height: 30px;
  border: 3px solid var(--border-color);
  border-top-color: var(--accent-blue);
  border-radius: 50%;
  animation: spin 0.8s linear infinite;
  margin-top: 12px;
}
/* ========== Prompts View ========== */

.prompts-list {
  display: flex;
  flex-direction: column;
  gap: 24px;
}

/* Prompt card — mirrors .favorite-prompt-card but stands alone */
.prompt-card {
  background: var(--bg-secondary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-md);
  overflow: hidden;
  transition: all 0.15s ease;
}

.prompt-card:hover {
  border-color: var(--border-light);
}

.prompt-card-header {
  display: flex;
  justify-content: space-between;
  align-items: flex-start;
  padding: 16px;
  border-bottom: 1px solid var(--border-color);
  background: var(--bg-tertiary);
}

.prompt-card-info {
  flex: 1;
}

.prompt-card-instruction {
  font-size: 0.9375rem;
  color: var(--text-primary);
  margin-bottom: 8px;
  line-height: 1.5;
}

.prompt-card-meta {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  font-size: 0.75rem;
  color: var(--text-muted);
}

.prompt-card-meta span {
  background: var(--bg-primary);
  padding: 2px 8px;
  border-radius: var(--radius-sm);
}

.prompt-card-actions {
  display: flex;
  gap: 8px;
}

/* ===== Prompt input images ===== */

.prompt-input-section {
  padding: 12px 16px;
  background: var(--bg-primary);
  border-bottom: 1px solid var(--border-color);
}

.prompt-input-title {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.05em;
  color: var(--text-muted);
  margin-bottom: 8px;
}

.prompt-input-images {
  display: flex;
  gap: 8px;
  flex-wrap: wrap;
}

.prompt-input-image {
  width: 100px;
  height: 100px;
  border-radius: var(--radius-sm);
  overflow: hidden;
  cursor: zoom-in;
  border: 1px solid var(--border-color);
  background: var(--bg-secondary);
}

.prompt-input-image img {
  width: 100%;
  height: 100%;
  object-fit: cover;
}

/* ===== Prompt models grid ===== */

.prompt-models-section {
  padding: 16px;
}

.prompt-models-title {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.05em;
  color: var(--text-muted);
  margin-bottom: 12px;
}

.prompt-models-grid {
  display: grid;
  grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
  gap: 12px;
}

.prompt-model-card {
  background: var(--bg-tertiary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-sm);
  overflow: hidden;
  transition: all 0.15s ease;
}

.prompt-model-card:hover {
  border-color: var(--border-light);
}

/* Podium accents: gold / silver / bronze */
.prompt-model-card.rank-1 {
  border-color: var(--accent-yellow);
  box-shadow: 0 0 8px rgba(210, 153, 34, 0.2);
}

.prompt-model-card.rank-2 {
  border-color: var(--text-muted);
}

.prompt-model-card.rank-3 {
  border-color: #cd7f32;
}

.prompt-model-image {
  aspect-ratio: 1;
  width: 100%;
  overflow: hidden;
  cursor: zoom-in;
  background: var(--bg-primary);
}

.prompt-model-image img {
  width: 100%;
  height: 100%;
  object-fit: cover;
  transition: transform 0.2s ease;
}

.prompt-model-image:hover img {
  transform: scale(1.05);
}

.prompt-model-info {
  padding: 8px;
  border-top: 1px solid var(--border-color);
}

.prompt-model-name {
  font-family: var(--font-mono);
  font-size: 0.75rem;
  color: var(--text-primary);
  margin-bottom: 4px;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
}

.prompt-model-rank {
  display: inline-block;
  margin-right: 4px;
  font-size: 0.75rem;
}

.prompt-model-rank.rank-1 {
  color: var(--accent-yellow);
}

.prompt-model-rank.rank-2 {
  color: var(--text-muted);
}

.prompt-model-rank.rank-3 {
  color: #cd7f32;
}

.prompt-model-stats {
  font-size: 0.6875rem;
  color: var(--text-muted);
}

.prompt-model-stats .win-rate {
  color: var(--accent-green);
  font-weight: 600;
}

.prompt-model-stats .wins {
  color: var(--accent-green);
}

.prompt-model-stats .losses {
  color: var(--accent-red);
}

.prompt-model-stats .ties {
  color: var(--accent-yellow);
}
/* ========== Clickable Model Name ========== */

.prompt-model-name.clickable {
  cursor: pointer;
  transition: color 0.2s ease;
}

.prompt-model-name.clickable:hover {
  color: var(--accent-blue);
  text-decoration: underline;
}
/* ========== Model Battles Modal ========== */

.model-battles-modal {
  max-height: 80vh;
  overflow-y: auto;
  padding: 20px;
}

.model-battles-header {
  margin-bottom: 20px;
  padding-bottom: 12px;
  border-bottom: 1px solid var(--border-color);
}

.model-battles-header h2 {
  margin: 0 0 4px 0;
  font-size: 1.25rem;
  color: var(--text-primary);
}

.model-battles-subtitle {
  margin: 0 0 8px 0;
  font-size: 0.875rem;
  color: var(--text-muted);
}

.model-battles-stats {
  display: flex;
  flex-wrap: wrap;
  gap: 16px;
  font-size: 0.875rem;
}

.model-battles-stats .stat-item {
  color: var(--text-secondary);
}

/* ===== Opponent filter panel ===== */

.model-battles-filter {
  background: var(--bg-card);
  border-radius: 8px;
  padding: 12px;
  margin-bottom: 16px;
}

.model-battles-filter .filter-header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  margin-bottom: 12px;
}

.model-battles-filter .filter-header h4 {
  margin: 0;
  font-size: 0.875rem;
  color: var(--text-primary);
}

.model-battles-filter .filter-actions {
  display: flex;
  gap: 8px;
}

.opponent-checkboxes {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  max-height: 150px;
  overflow-y: auto;
}

.opponent-checkbox {
  display: flex;
  align-items: center;
  gap: 4px;
  background: var(--bg-main);
  padding: 4px 8px;
  border-radius: 4px;
  font-size: 0.75rem;
  cursor: pointer;
  white-space: nowrap;
}

.opponent-checkbox input {
  cursor: pointer;
}

.opponent-checkbox span {
  color: var(--text-secondary);
}

.opponent-checkbox:hover {
  background: var(--bg-hover);
}

.model-battles-list {
  margin-top: 16px;
}

.model-battles-list h4 {
  margin: 0 0 12px 0;
  font-size: 0.875rem;
  color: var(--text-primary);
}

/* ===== Battle record cards (expandable) ===== */

.battle-records-container {
  display: flex;
  flex-direction: column;
  gap: 8px;
}

.battle-record-card {
  background: var(--bg-card);
  border-radius: 8px;
  border: 1px solid var(--border-color);
  overflow: hidden;
}

.battle-record-header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 12px 16px;
  cursor: pointer;
  transition: background-color 0.2s ease;
}

.battle-record-header:hover {
  background: var(--bg-hover);
}

.battle-record-info {
  display: flex;
  align-items: center;
  gap: 12px;
  flex-wrap: wrap;
}

.battle-opponent {
  font-weight: 500;
  color: var(--text-primary);
  font-size: 0.875rem;
}

/* Chevron flips when the card is expanded */
.expand-icon {
  color: var(--text-muted);
  font-size: 0.75rem;
  transition: transform 0.2s ease;
}

.battle-record-card.expanded .expand-icon {
  transform: rotate(180deg);
}

/* ===== Judge outputs (revealed on expand) ===== */

.battle-card-judge-outputs {
  display: none;
  padding: 12px 16px 16px;
  border-top: 1px solid var(--border-color);
  background: var(--bg-main);
}

.battle-record-card.expanded .battle-card-judge-outputs {
  display: block;
}

.judge-outputs-title {
  font-size: 0.8125rem;
  font-weight: 600;
  color: var(--text-primary);
  margin-bottom: 12px;
}

.judge-call {
  background: var(--bg-card);
  border-radius: 6px;
  padding: 12px;
  margin-bottom: 10px;
}

.judge-call:last-child {
  margin-bottom: 0;
}

.judge-call-header {
  display: flex;
  align-items: center;
  gap: 8px;
  margin-bottom: 8px;
}

.judge-call-label {
  font-size: 0.75rem;
  font-weight: 600;
  color: var(--accent-blue);
  text-transform: uppercase;
}

.judge-call-order {
  font-size: 0.75rem;
  color: var(--text-muted);
}

.judge-call-meta {
  font-size: 0.75rem;
  color: var(--text-secondary);
  margin-bottom: 8px;
}

.judge-call-response {
  font-size: 0.75rem;
  color: var(--text-secondary);
  background: var(--bg-main);
  padding: 10px;
  border-radius: 4px;
  white-space: pre-wrap;
  word-break: break-word;
  max-height: 200px;
  overflow-y: auto;
  line-height: 1.5;
  font-family: var(--font-mono);
}

/* ===== Battles table ===== */

.battles-table {
  width: 100%;
  border-collapse: collapse;
  font-size: 0.8125rem;
}

.battles-table th,
.battles-table td {
  padding: 8px 12px;
  text-align: left;
  border-bottom: 1px solid var(--border-color);
}

.battles-table th {
  background: var(--bg-card);
  color: var(--text-primary);
  font-weight: 600;
}

.battles-table td {
  color: var(--text-secondary);
}

.battles-table tbody tr:hover {
  background: var(--bg-hover);
}

/* Result badges */
.badge.result-win {
  background: rgba(16, 185, 129, 0.2);
  color: var(--accent-green);
}

.badge.result-loss {
  background: rgba(239, 68, 68, 0.2);
  color: var(--accent-red);
}

.badge.result-tie {
  background: rgba(245, 158, 11, 0.2);
  color: var(--accent-yellow);
}
/* ========== Responsive ========== */

/* Tablet: narrower sidebar, two-column image grids */
@media (max-width: 1024px) {
  .sidebar {
    width: 240px;
  }

  .battle-images {
    grid-template-columns: repeat(2, 1fr);
  }

  .detail-images {
    grid-template-columns: repeat(2, 1fr);
  }
}

/* Phone: stack the header, sidebar, and image grids vertically */
@media (max-width: 768px) {
  .header {
    flex-direction: column;
    gap: 12px;
    padding: 12px 16px;
  }

  .header-center {
    flex-direction: column;
    width: 100%;
  }

  .selector-group {
    width: 100%;
  }

  .selector {
    flex: 1;
  }

  .main-container {
    flex-direction: column;
  }

  .sidebar {
    width: 100%;
    border-right: none;
    border-bottom: 1px solid var(--border-color);
  }

  .battle-images,
  .detail-images {
    grid-template-columns: 1fr;
  }
}
/* ========== Opponent Sections (Collapsible) ========== */

.model-battles-hint {
  font-size: 0.75rem;
  color: var(--text-muted);
  margin-bottom: 12px;
  font-style: italic;
}

.opponent-sections-container {
  display: flex;
  flex-direction: column;
  gap: 8px;
}

.opponent-section {
  background: var(--bg-tertiary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-md);
  overflow: hidden;
}

.opponent-section.hidden {
  display: none;
}

.opponent-section-header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 12px 16px;
  cursor: pointer;
  transition: background-color 0.2s ease;
  user-select: none;
}

.opponent-section-header:hover {
  background: var(--bg-hover);
}

.opponent-section-info {
  display: flex;
  align-items: center;
  gap: 16px;
  flex-wrap: wrap;
}

.opponent-name {
  font-weight: 600;
  font-size: 0.9375rem;
  color: var(--text-primary);
}

/* Name tinted by overall result against this opponent */
.opponent-name.result-win {
  color: var(--accent-green);
}

.opponent-name.result-loss {
  color: var(--accent-red);
}

.opponent-name.result-tie {
  color: var(--accent-yellow);
}

.opponent-stats {
  font-size: 0.8125rem;
  color: var(--text-secondary);
  font-family: var(--font-mono);
}

.opponent-stats .wins {
  color: var(--accent-green);
}

.opponent-stats .losses {
  color: var(--accent-red);
}

.opponent-stats .ties {
  color: var(--accent-yellow);
}

.opponent-section .expand-icon {
  color: var(--text-muted);
  font-size: 0.875rem;
  transition: transform 0.2s ease;
}

.opponent-section.expanded .expand-icon {
  transform: rotate(180deg);
}

.opponent-section-content {
  display: none;
  padding: 0 16px 16px;
  border-top: 1px solid var(--border-color);
  background: var(--bg-secondary);
}

.opponent-section.expanded .opponent-section-content {
  display: block;
}

/* Single battle record inside an opponent section */
.battle-record-item {
  background: var(--bg-tertiary);
  border-radius: var(--radius-sm);
  padding: 12px;
  margin-top: 12px;
}
+
2632
+ .battle-record-item:first-child {
2633
+ margin-top: 16px;
2634
+ }
2635
+
2636
+ .battle-record-item-header {
2637
+ display: flex;
2638
+ align-items: center;
2639
+ gap: 8px;
2640
+ flex-wrap: wrap;
2641
+ margin-bottom: 12px;
2642
+ }
2643
+
2644
+ .battle-exp-name {
2645
+ font-size: 0.75rem;
2646
+ color: var(--text-muted);
2647
+ background: var(--bg-primary);
2648
+ padding: 2px 6px;
2649
+ border-radius: var(--radius-sm);
2650
+ }
2651
+
2652
+ /* Battle Judge Outputs (inside battle record item) */
2653
+ .battle-judge-outputs {
2654
+ display: flex;
2655
+ flex-direction: column;
2656
+ gap: 12px;
2657
+ }
2658
+
2659
+ .battle-judge-outputs .judge-call {
2660
+ background: var(--bg-primary);
2661
+ border-radius: var(--radius-sm);
2662
+ padding: 12px;
2663
+ margin-bottom: 0;
2664
+ }
2665
+
2666
+ .battle-judge-outputs .placeholder {
2667
+ font-size: 0.75rem;
2668
+ color: var(--text-muted);
2669
+ font-style: italic;
2670
+ padding: 8px;
2671
+ text-align: center;
2672
+ }
2673
+
2674
+
2675
+ /* ========== Search Box ========== */
2676
+ .search-box {
2677
+ display: flex;
2678
+ align-items: center;
2679
+ gap: 4px;
2680
+ }
2681
+
2682
+ .search-input {
2683
+ width: 200px;
2684
+ padding: 6px 12px;
2685
+ background: var(--bg-tertiary);
2686
+ border: 1px solid var(--border-color);
2687
+ border-radius: var(--radius-sm);
2688
+ color: var(--text-primary);
2689
+ font-size: 0.875rem;
2690
+ }
2691
+
2692
+ .search-input:focus {
2693
+ outline: none;
2694
+ border-color: var(--accent-blue);
2695
+ }
2696
+
2697
+ .search-input::placeholder {
2698
+ color: var(--text-muted);
2699
+ }
2700
+
2701
+ .search-btn, .clear-search-btn {
2702
+ padding: 6px 10px;
2703
+ min-width: auto;
2704
+ }
2705
+
2706
+ .clear-search-btn {
2707
+ color: var(--accent-red);
2708
+ }
2709
+
2710
+ /* Search results highlight */
2711
+ .search-highlight {
2712
+ background: rgba(88, 166, 255, 0.3);
2713
+ padding: 0 2px;
2714
+ border-radius: 2px;
2715
+ }
2716
+
2717
+ /* ========== Compare View ========== */
2718
+ .compare-list {
2719
+ display: flex;
2720
+ flex-direction: column;
2721
+ gap: 16px;
2722
+ }
2723
+
2724
+ .compare-controls {
2725
+ display: flex;
2726
+ justify-content: center;
2727
+ padding: 16px;
2728
+ background: var(--bg-secondary);
2729
+ border-radius: var(--radius-md);
2730
+ border: 1px solid var(--border-color);
2731
+ }
2732
+
2733
+ .compare-input-group {
2734
+ display: flex;
2735
+ align-items: center;
2736
+ gap: 12px;
2737
+ }
2738
+
2739
+ .compare-input-group label {
2740
+ color: var(--text-secondary);
2741
+ font-size: 0.875rem;
2742
+ }
2743
+
2744
+ .compare-sample-input {
2745
+ width: 120px;
2746
+ padding: 8px 12px;
2747
+ background: var(--bg-tertiary);
2748
+ border: 1px solid var(--border-color);
2749
+ border-radius: var(--radius-sm);
2750
+ color: var(--text-primary);
2751
+ font-size: 0.875rem;
2752
+ }
2753
+
2754
+ .compare-content {
2755
+ padding: 16px;
2756
+ }
2757
+
2758
+ .compare-grid {
2759
+ display: grid;
2760
+ grid-template-columns: repeat(auto-fill, minmax(250px, 1fr));
2761
+ gap: 16px;
2762
+ }
2763
+
2764
+ .compare-model-card {
2765
+ background: var(--bg-secondary);
2766
+ border: 2px solid var(--border-color);
2767
+ border-radius: var(--radius-md);
2768
+ overflow: hidden;
2769
+ transition: all 0.15s ease;
2770
+ cursor: pointer;
2771
+ }
2772
+
2773
+ .compare-model-card:hover {
2774
+ border-color: var(--accent-blue);
2775
+ }
2776
+
2777
+ .compare-model-card.rank-1 {
2778
+ border-color: var(--accent-yellow);
2779
+ box-shadow: 0 0 12px rgba(210, 153, 34, 0.3);
2780
+ }
2781
+
2782
+ .compare-model-card.rank-2 {
2783
+ border-color: #c0c0c0;
2784
+ }
2785
+
2786
+ .compare-model-card.rank-3 {
2787
+ border-color: #cd7f32;
2788
+ }
2789
+
2790
+ .compare-model-card.selected {
2791
+ border-color: var(--accent-purple);
2792
+ box-shadow: 0 0 12px rgba(163, 113, 247, 0.3);
2793
+ }
2794
+
2795
+ .compare-model-image {
2796
+ aspect-ratio: 1;
2797
+ width: 100%;
2798
+ overflow: hidden;
2799
+ background: var(--bg-primary);
2800
+ }
2801
+
2802
+ .compare-model-image img {
2803
+ width: 100%;
2804
+ height: 100%;
2805
+ object-fit: cover;
2806
+ }
2807
+
2808
+ .compare-model-info {
2809
+ padding: 12px;
2810
+ border-top: 1px solid var(--border-color);
2811
+ }
2812
+
2813
+ .compare-model-name {
2814
+ font-family: var(--font-mono);
2815
+ font-size: 0.875rem;
2816
+ color: var(--text-primary);
2817
+ margin-bottom: 4px;
2818
+ }
2819
+
2820
+ .compare-model-stats {
2821
+ font-size: 0.75rem;
2822
+ color: var(--text-muted);
2823
+ }
2824
+
2825
+ .compare-model-stats .win-rate {
2826
+ color: var(--accent-green);
2827
+ font-weight: 600;
2828
+ }
2829
+
2830
+ /* Compare header info */
2831
+ .compare-header-info {
2832
+ background: var(--bg-tertiary);
2833
+ padding: 16px;
2834
+ border-radius: var(--radius-md);
2835
+ margin-bottom: 16px;
2836
+ }
2837
+
2838
+ .compare-header-info h3 {
2839
+ font-size: 1rem;
2840
+ margin-bottom: 8px;
2841
+ }
2842
+
2843
+ .compare-header-info .instruction {
2844
+ color: var(--text-secondary);
2845
+ font-size: 0.875rem;
2846
+ line-height: 1.5;
2847
+ }
2848
+
2849
+ /* ========== ELO Actions ========== */
2850
+ .elo-actions {
2851
+ display: flex;
2852
+ gap: 4px;
2853
+ margin-top: 12px;
2854
+ flex-wrap: wrap;
2855
+ }
2856
+
2857
+ .elo-actions .btn {
2858
+ flex: 1;
2859
+ min-width: 60px;
2860
+ font-size: 0.6875rem;
2861
+ padding: 4px 6px;
2862
+ }
2863
+
2864
+ /* ========== Multi-Subset Modal ========== */
2865
+ .multi-subset-header {
2866
+ padding: 20px;
2867
+ border-bottom: 1px solid var(--border-color);
2868
+ }
2869
+
2870
+ .multi-subset-header h2 {
2871
+ margin: 0;
2872
+ font-size: 1.25rem;
2873
+ }
2874
+
2875
+ .multi-subset-body {
2876
+ padding: 20px;
2877
+ }
2878
+
2879
+ .multi-subset-selection {
2880
+ margin-bottom: 20px;
2881
+ }
2882
+
2883
+ .multi-subset-selection h4 {
2884
+ font-size: 0.875rem;
2885
+ color: var(--text-secondary);
2886
+ margin-bottom: 12px;
2887
+ }
2888
+
2889
+ .multi-subset-info {
2890
+ background: var(--bg-tertiary);
2891
+ padding: 12px 16px;
2892
+ border-radius: var(--radius-sm);
2893
+ margin-bottom: 16px;
2894
+ }
2895
+
2896
+ .multi-subset-info p {
2897
+ margin: 4px 0;
2898
+ font-size: 0.875rem;
2899
+ color: var(--text-secondary);
2900
+ }
2901
+
2902
+ .multi-subset-info span {
2903
+ font-family: var(--font-mono);
2904
+ color: var(--accent-blue);
2905
+ }
2906
+
2907
+ .multi-subset-options {
2908
+ margin-bottom: 16px;
2909
+ display: flex;
2910
+ align-items: center;
2911
+ gap: 16px;
2912
+ }
2913
+
2914
+ .multi-subset-options label:first-child {
2915
+ color: var(--text-secondary);
2916
+ font-size: 0.875rem;
2917
+ }
2918
+
2919
+ .radio-label {
2920
+ display: flex;
2921
+ align-items: center;
2922
+ gap: 4px;
2923
+ font-size: 0.875rem;
2924
+ color: var(--text-primary);
2925
+ cursor: pointer;
2926
+ }
2927
+
2928
+ .radio-label input[type="radio"] {
2929
+ accent-color: var(--accent-blue);
2930
+ }
2931
+
2932
+ .multi-subset-results {
2933
+ border-top: 1px solid var(--border-color);
2934
+ padding: 20px;
2935
+ margin-top: 20px;
2936
+ }
2937
+
2938
+ .multi-subset-results h3 {
2939
+ font-size: 0.875rem;
2940
+ color: var(--text-muted);
2941
+ text-transform: uppercase;
2942
+ letter-spacing: 0.05em;
2943
+ margin-bottom: 16px;
2944
+ }
2945
+
2946
+ /* Subset tag for showing model presence */
2947
+ .subset-tag {
2948
+ display: inline-block;
2949
+ font-size: 0.5625rem;
2950
+ padding: 2px 4px;
2951
+ background: var(--bg-primary);
2952
+ border-radius: 2px;
2953
+ margin-left: 4px;
2954
+ color: var(--text-muted);
2955
+ }
2956
+
2957
+ /* ========== Win Rate Matrix Modal ========== */
2958
+ .matrix-modal-header {
2959
+ display: flex;
2960
+ justify-content: space-between;
2961
+ align-items: center;
2962
+ padding: 20px;
2963
+ border-bottom: 1px solid var(--border-color);
2964
+ }
2965
+
2966
+ .matrix-modal-header h2 {
2967
+ margin: 0;
2968
+ font-size: 1.25rem;
2969
+ }
2970
+
2971
+ .matrix-content {
2972
+ padding: 20px;
2973
+ overflow-x: auto;
2974
+ }
2975
+
2976
+ .matrix-table-container {
2977
+ overflow-x: auto;
2978
+ max-width: 100%;
2979
+ }
2980
+
2981
+ .matrix-table {
2982
+ border-collapse: collapse;
2983
+ font-size: 0.75rem;
2984
+ }
2985
+
2986
+ .matrix-table th,
2987
+ .matrix-table td {
2988
+ padding: 8px;
2989
+ text-align: center;
2990
+ border: 1px solid var(--border-color);
2991
+ min-width: 60px;
2992
+ }
2993
+
2994
+ .matrix-table th {
2995
+ background: var(--bg-tertiary);
2996
+ color: var(--text-secondary);
2997
+ font-weight: 600;
2998
+ position: sticky;
2999
+ }
3000
+
3001
+ .matrix-table th:first-child {
3002
+ left: 0;
3003
+ z-index: 2;
3004
+ }
3005
+
3006
+ .matrix-table thead th {
3007
+ top: 0;
3008
+ z-index: 1;
3009
+ }
3010
+
3011
+ .matrix-table thead th:first-child {
3012
+ z-index: 3;
3013
+ }
3014
+
3015
+ .matrix-table tbody th {
3016
+ text-align: right;
3017
+ background: var(--bg-tertiary);
3018
+ left: 0;
3019
+ z-index: 1;
3020
+ }
3021
+
3022
+ .matrix-cell {
3023
+ font-family: var(--font-mono);
3024
+ font-size: 0.6875rem;
3025
+ cursor: pointer;
3026
+ transition: all 0.15s ease;
3027
+ }
3028
+
3029
+ .matrix-cell:hover {
3030
+ transform: scale(1.1);
3031
+ z-index: 5;
3032
+ position: relative;
3033
+ }
3034
+
3035
+ .matrix-cell-diagonal {
3036
+ background: var(--bg-primary) !important;
3037
+ color: var(--text-muted);
3038
+ }
3039
+
3040
+ /* Matrix tooltip */
3041
+ .matrix-tooltip {
3042
+ position: absolute;
3043
+ background: var(--bg-secondary);
3044
+ border: 1px solid var(--border-color);
3045
+ border-radius: var(--radius-sm);
3046
+ padding: 8px 12px;
3047
+ font-size: 0.75rem;
3048
+ color: var(--text-primary);
3049
+ z-index: 100;
3050
+ pointer-events: none;
3051
+ box-shadow: var(--shadow-md);
3052
+ white-space: nowrap;
3053
+ }
3054
+
3055
+ /* ========== ELO History Modal ========== */
3056
+ .elo-history-header {
3057
+ display: flex;
3058
+ justify-content: space-between;
3059
+ align-items: center;
3060
+ padding: 20px;
3061
+ border-bottom: 1px solid var(--border-color);
3062
+ }
3063
+
3064
+ .elo-history-header h2 {
3065
+ margin: 0;
3066
+ font-size: 1.25rem;
3067
+ }
3068
+
3069
+ .elo-history-controls {
3070
+ display: flex;
3071
+ align-items: center;
3072
+ gap: 8px;
3073
+ }
3074
+
3075
+ .elo-history-controls label {
3076
+ font-size: 0.875rem;
3077
+ color: var(--text-secondary);
3078
+ }
3079
+
3080
+ .elo-history-content {
3081
+ padding: 20px;
3082
+ min-height: 400px;
3083
+ }
3084
+
3085
+ .elo-history-chart {
3086
+ width: 100%;
3087
+ height: 400px;
3088
+ }
3089
+
3090
+ .elo-history-legend {
3091
+ padding: 0 20px 20px;
3092
+ display: flex;
3093
+ flex-wrap: wrap;
3094
+ gap: 12px;
3095
+ justify-content: center;
3096
+ }
3097
+
3098
+ .legend-item {
3099
+ display: flex;
3100
+ align-items: center;
3101
+ gap: 6px;
3102
+ font-size: 0.75rem;
3103
+ cursor: pointer;
3104
+ padding: 4px 8px;
3105
+ border-radius: var(--radius-sm);
3106
+ background: var(--bg-tertiary);
3107
+ transition: all 0.15s ease;
3108
+ }
3109
+
3110
+ .legend-item:hover {
3111
+ background: var(--bg-hover);
3112
+ }
3113
+
3114
+ .legend-item.hidden {
3115
+ opacity: 0.4;
3116
+ }
3117
+
3118
+ .legend-color {
3119
+ width: 12px;
3120
+ height: 3px;
3121
+ border-radius: 1px;
3122
+ }
3123
+
3124
+ .legend-label {
3125
+ color: var(--text-secondary);
3126
+ }
3127
+
3128
+ /* ========== ELO by Source Modal ========== */
3129
+ .elo-by-source-header {
3130
+ display: flex;
3131
+ justify-content: space-between;
3132
+ align-items: center;
3133
+ padding: 20px;
3134
+ border-bottom: 1px solid var(--border-color);
3135
+ }
3136
+
3137
+ .elo-by-source-header h2 {
3138
+ margin: 0;
3139
+ font-size: 1.25rem;
3140
+ }
3141
+
3142
+ .elo-by-source-content {
3143
+ padding: 20px;
3144
+ max-height: calc(90vh - 100px);
3145
+ overflow-y: auto;
3146
+ }
3147
+
3148
+ .source-section {
3149
+ margin-bottom: 24px;
3150
+ background: var(--bg-tertiary);
3151
+ border-radius: var(--radius-md);
3152
+ overflow: hidden;
3153
+ }
3154
+
3155
+ .source-section-header {
3156
+ display: flex;
3157
+ justify-content: space-between;
3158
+ align-items: center;
3159
+ padding: 12px 16px;
3160
+ background: var(--bg-secondary);
3161
+ cursor: pointer;
3162
+ }
3163
+
3164
+ .source-section-header:hover {
3165
+ background: var(--bg-hover);
3166
+ }
3167
+
3168
+ .source-name {
3169
+ font-weight: 600;
3170
+ color: var(--text-primary);
3171
+ }
3172
+
3173
+ .source-stats {
3174
+ font-size: 0.75rem;
3175
+ color: var(--text-muted);
3176
+ }
3177
+
3178
+ .source-leaderboard {
3179
+ padding: 12px 16px;
3180
+ }
3181
+
3182
+ .source-leaderboard-item {
3183
+ display: flex;
3184
+ align-items: center;
3185
+ gap: 12px;
3186
+ padding: 6px 0;
3187
+ border-bottom: 1px solid var(--border-color);
3188
+ }
3189
+
3190
+ .source-leaderboard-item:last-child {
3191
+ border-bottom: none;
3192
+ }
3193
+
3194
+ .source-rank {
3195
+ font-weight: 600;
3196
+ min-width: 24px;
3197
+ text-align: center;
3198
+ color: var(--text-muted);
3199
+ }
3200
+
3201
+ .source-rank.rank-1 { color: var(--accent-yellow); }
3202
+ .source-rank.rank-2 { color: #c0c0c0; }
3203
+ .source-rank.rank-3 { color: #cd7f32; }
3204
+
3205
+ .source-model-name {
3206
+ flex: 1;
3207
+ font-family: var(--font-mono);
3208
+ font-size: 0.8125rem;
3209
+ }
3210
+
3211
+ .source-elo {
3212
+ font-family: var(--font-mono);
3213
+ font-size: 0.8125rem;
3214
+ color: var(--accent-blue);
3215
+ min-width: 50px;
3216
+ text-align: right;
3217
+ }
3218
+
3219
+ /* ========== Missing CSS Variables Fix ========== */
3220
+ /* These were referenced but not defined - redeclare them properly */
3221
+
3222
+ /* ========== Win Rate Matrix Styles ========== */
3223
+ .matrix-scroll-container {
3224
+ overflow-x: auto;
3225
+ max-width: 100%;
3226
+ padding-bottom: 10px;
3227
+ }
3228
+
3229
+ .win-rate-matrix {
3230
+ border-collapse: collapse;
3231
+ font-size: 0.6875rem;
3232
+ margin: 0 auto;
3233
+ }
3234
+
3235
+ .win-rate-matrix th,
3236
+ .win-rate-matrix td {
3237
+ padding: 4px 6px;
3238
+ text-align: center;
3239
+ border: 1px solid var(--border-color);
3240
+ min-width: 50px;
3241
+ max-width: 80px;
3242
+ }
3243
+
3244
+ .win-rate-matrix .matrix-corner {
3245
+ background: var(--bg-primary);
3246
+ position: sticky;
3247
+ left: 0;
3248
+ top: 0;
3249
+ z-index: 3;
3250
+ }
3251
+
3252
+ .win-rate-matrix .matrix-header-cell {
3253
+ background: var(--bg-tertiary);
3254
+ color: var(--text-secondary);
3255
+ font-weight: 600;
3256
+ font-size: 0.625rem;
3257
+ writing-mode: vertical-rl;
3258
+ text-orientation: mixed;
3259
+ padding: 8px 4px;
3260
+ max-height: 100px;
3261
+ white-space: nowrap;
3262
+ overflow: hidden;
3263
+ text-overflow: ellipsis;
3264
+ }
3265
+
3266
+ .win-rate-matrix .matrix-row-header {
3267
+ background: var(--bg-tertiary);
3268
+ color: var(--text-secondary);
3269
+ font-weight: 600;
3270
+ font-size: 0.625rem;
3271
+ text-align: right;
3272
+ padding-right: 8px;
3273
+ position: sticky;
3274
+ left: 0;
3275
+ z-index: 1;
3276
+ white-space: nowrap;
3277
+ max-width: none;
3278
+ }
3279
+
3280
+ .win-rate-matrix .matrix-cell {
3281
+ font-family: var(--font-mono);
3282
+ font-size: 0.625rem;
3283
+ cursor: default;
3284
+ transition: transform 0.1s ease;
3285
+ }
3286
+
3287
+ .win-rate-matrix .matrix-cell:hover {
3288
+ transform: scale(1.15);
3289
+ position: relative;
3290
+ z-index: 2;
3291
+ box-shadow: 0 0 8px rgba(0,0,0,0.5);
3292
+ }
3293
+
3294
+ .win-rate-matrix .matrix-diagonal {
3295
+ background: var(--bg-primary) !important;
3296
+ color: var(--text-muted);
3297
+ }
3298
+
3299
+ .win-rate-matrix .matrix-no-data {
3300
+ background: var(--bg-primary) !important;
3301
+ color: var(--text-muted);
3302
+ }
3303
+
3304
+ .matrix-legend {
3305
+ display: flex;
3306
+ align-items: center;
3307
+ justify-content: center;
3308
+ gap: 12px;
3309
+ margin-top: 16px;
3310
+ padding: 12px;
3311
+ background: var(--bg-tertiary);
3312
+ border-radius: var(--radius-sm);
3313
+ }
3314
+
3315
+ .matrix-legend-label {
3316
+ font-size: 0.75rem;
3317
+ color: var(--text-secondary);
3318
+ }
3319
+
3320
+ .matrix-legend-gradient {
3321
+ display: flex;
3322
+ align-items: center;
3323
+ gap: 8px;
3324
+ }
3325
+
3326
+ .matrix-legend-gradient .legend-bar {
3327
+ width: 100px;
3328
+ height: 16px;
3329
+ background: linear-gradient(to right, rgb(255, 55, 55), rgb(255, 255, 255), rgb(102, 200, 102));
3330
+ border-radius: 2px;
3331
+ border: 1px solid var(--border-color);
3332
+ }
3333
+
3334
+ .matrix-legend-gradient .legend-low,
3335
+ .matrix-legend-gradient .legend-high {
3336
+ font-size: 0.6875rem;
3337
+ color: var(--text-muted);
3338
+ }
3339
+
3340
+ /* ========== ELO History SVG Styles ========== */
3341
+ .elo-history-chart {
3342
+ font-family: var(--font-sans);
3343
+ }
3344
+
3345
+ .elo-history-chart .axis-line {
3346
+ stroke: var(--border-color);
3347
+ stroke-width: 1;
3348
+ }
3349
+
3350
+ .elo-history-chart .grid-line {
3351
+ stroke: var(--border-color);
3352
+ stroke-width: 0.5;
3353
+ stroke-dasharray: 4 4;
3354
+ opacity: 0.5;
3355
+ }
3356
+
3357
+ .elo-history-chart .axis-label {
3358
+ font-size: 10px;
3359
+ fill: var(--text-muted);
3360
+ }
3361
+
3362
+ .elo-history-chart .axis-title {
3363
+ font-size: 11px;
3364
+ fill: var(--text-secondary);
3365
+ font-weight: 500;
3366
+ }
3367
+
3368
+ .elo-history-chart .elo-line {
3369
+ stroke-linecap: round;
3370
+ stroke-linejoin: round;
3371
+ }
3372
+
3373
+ .elo-history-chart .elo-line:hover {
3374
+ stroke-width: 4;
3375
+ }
3376
+
3377
+ .elo-history-chart .elo-point {
3378
+ cursor: pointer;
3379
+ opacity: 0;
3380
+ transition: opacity 0.15s ease, r 0.15s ease;
3381
+ }
3382
+
3383
+ .elo-history-chart:hover .elo-point {
3384
+ opacity: 1;
3385
+ }
3386
+
3387
+ .elo-history-chart .elo-point:hover {
3388
+ r: 6;
3389
+ stroke: var(--text-primary);
3390
+ stroke-width: 2;
3391
+ }
3392
+
3393
+ .elo-history-chart-container {
3394
+ position: relative;
3395
+ }
3396
+
3397
+ .elo-tooltip {
3398
+ position: absolute;
3399
+ background: var(--bg-primary);
3400
+ border: 1px solid var(--border-color);
3401
+ border-radius: var(--radius-sm);
3402
+ padding: 8px 12px;
3403
+ font-size: 0.75rem;
3404
+ line-height: 1.4;
3405
+ pointer-events: none;
3406
+ opacity: 0;
3407
+ transition: opacity 0.15s ease;
3408
+ z-index: 100;
3409
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
3410
+ white-space: nowrap;
3411
+ }
3412
+
3413
+ .elo-tooltip.visible {
3414
+ opacity: 1;
3415
+ }
3416
+
3417
+ .elo-tooltip strong {
3418
+ color: var(--text-primary);
3419
+ }
3420
+
3421
+ /* Legend hidden state */
3422
+ .legend-item.hidden-model {
3423
+ opacity: 0.4;
3424
+ }
3425
+
3426
+ .legend-item.hidden-model .legend-color {
3427
+ background: var(--text-muted) !important;
3428
+ }
3429
+
3430
+ /* ========== ELO by Source Expanded Styles ========== */
3431
+ .source-sections-container {
3432
+ display: flex;
3433
+ flex-direction: column;
3434
+ gap: 12px;
3435
+ }
3436
+
3437
+ .source-section {
3438
+ background: var(--bg-secondary);
3439
+ border: 1px solid var(--border-color);
3440
+ border-radius: var(--radius-md);
3441
+ overflow: hidden;
3442
+ }
3443
+
3444
+ .source-section-header {
3445
+ display: flex;
3446
+ justify-content: space-between;
3447
+ align-items: center;
3448
+ padding: 12px 16px;
3449
+ cursor: pointer;
3450
+ user-select: none;
3451
+ transition: background 0.15s ease;
3452
+ }
3453
+
3454
+ .source-section-header:hover {
3455
+ background: var(--bg-hover);
3456
+ }
3457
+
3458
+ .source-section-info {
3459
+ display: flex;
3460
+ align-items: center;
3461
+ gap: 12px;
3462
+ }
3463
+
3464
+ .source-name {
3465
+ font-weight: 600;
3466
+ color: var(--text-primary);
3467
+ }
3468
+
3469
+ .source-battles {
3470
+ font-size: 0.75rem;
3471
+ color: var(--text-muted);
3472
+ }
3473
+
3474
+ .source-section .expand-icon {
3475
+ color: var(--text-muted);
3476
+ font-size: 0.75rem;
3477
+ transition: transform 0.2s ease;
3478
+ }
3479
+
3480
+ .source-section.expanded .expand-icon {
3481
+ transform: rotate(180deg);
3482
+ }
3483
+
3484
+ .source-section-content {
3485
+ display: none;
3486
+ padding: 12px 16px;
3487
+ border-top: 1px solid var(--border-color);
3488
+ }
3489
+
3490
+ .source-section.expanded .source-section-content {
3491
+ display: block;
3492
+ }
3493
+
3494
+ .source-leaderboard {
3495
+ width: 100%;
3496
+ border-collapse: collapse;
3497
+ font-size: 0.8125rem;
3498
+ }
3499
+
3500
+ .source-leaderboard th,
3501
+ .source-leaderboard td {
3502
+ padding: 8px 12px;
3503
+ text-align: left;
3504
+ border-bottom: 1px solid var(--border-color);
3505
+ }
3506
+
3507
+ .source-leaderboard th {
3508
+ background: var(--bg-tertiary);
3509
+ color: var(--text-secondary);
3510
+ font-weight: 600;
3511
+ font-size: 0.75rem;
3512
+ }
3513
+
3514
+ .source-leaderboard tbody tr:hover {
3515
+ background: var(--bg-hover);
3516
+ }
3517
+
3518
+ .source-leaderboard .rank-cell {
3519
+ font-weight: 600;
3520
+ color: var(--text-muted);
3521
+ }
3522
+
3523
+ .source-leaderboard .rank-cell.rank-1 { color: var(--accent-yellow); }
3524
+ .source-leaderboard .rank-cell.rank-2 { color: #c0c0c0; }
3525
+ .source-leaderboard .rank-cell.rank-3 { color: #cd7f32; }
3526
+
3527
+ .source-leaderboard .model-cell {
3528
+ font-family: var(--font-mono);
3529
+ font-size: 0.75rem;
3530
+ }
3531
+
3532
+ .source-leaderboard .elo-cell {
3533
+ font-family: var(--font-mono);
3534
+ color: var(--accent-blue);
3535
+ }
3536
+
3537
+ .source-leaderboard .wins {
3538
+ color: var(--accent-green);
3539
+ }
3540
+
3541
+ .source-leaderboard .losses {
3542
+ color: var(--accent-red);
3543
+ }
3544
+
3545
+ .source-leaderboard .ties {
3546
+ color: var(--accent-yellow);
3547
+ }
3548
+
3549
+ .source-leaderboard .win-rate-cell {
3550
+ font-family: var(--font-mono);
3551
+ color: var(--accent-green);
3552
+ }
3553
+
3554
+ /* ========== Merged ELO Results ========== */
3555
+ .merged-elo-info {
3556
+ background: var(--bg-tertiary);
3557
+ padding: 12px 16px;
3558
+ border-radius: var(--radius-sm);
3559
+ margin-bottom: 16px;
3560
+ }
3561
+
3562
+ .merged-elo-info p {
3563
+ margin: 4px 0;
3564
+ font-size: 0.875rem;
3565
+ color: var(--text-secondary);
3566
+ }
3567
+
3568
+ .merged-leaderboard {
3569
+ width: 100%;
3570
+ border-collapse: collapse;
3571
+ font-size: 0.8125rem;
3572
+ }
3573
+
3574
+ .merged-leaderboard th,
3575
+ .merged-leaderboard td {
3576
+ padding: 8px 12px;
3577
+ text-align: left;
3578
+ border-bottom: 1px solid var(--border-color);
3579
+ }
3580
+
3581
+ .merged-leaderboard th {
3582
+ background: var(--bg-tertiary);
3583
+ color: var(--text-secondary);
3584
+ font-weight: 600;
3585
+ font-size: 0.75rem;
3586
+ }
3587
+
3588
+ .merged-leaderboard tbody tr:hover {
3589
+ background: var(--bg-hover);
3590
+ }
3591
+
3592
+ .merged-leaderboard .rank-cell {
3593
+ font-weight: 600;
3594
+ color: var(--text-muted);
3595
+ }
3596
+
3597
+ .merged-leaderboard .rank-cell.rank-1 { color: var(--accent-yellow); }
3598
+ .merged-leaderboard .rank-cell.rank-2 { color: #c0c0c0; }
3599
+ .merged-leaderboard .rank-cell.rank-3 { color: #cd7f32; }
3600
+
3601
+ .merged-leaderboard .model-cell {
3602
+ font-family: var(--font-mono);
3603
+ font-size: 0.75rem;
3604
+ }
3605
+
3606
+ .merged-leaderboard .elo-cell {
3607
+ font-family: var(--font-mono);
3608
+ color: var(--accent-blue);
3609
+ }
3610
+
3611
+ .merged-leaderboard .wins {
3612
+ color: var(--accent-green);
3613
+ }
3614
+
3615
+ .merged-leaderboard .losses {
3616
+ color: var(--accent-red);
3617
+ }
3618
+
3619
+ .merged-leaderboard .stat-cell.ties,
3620
+ .merged-leaderboard .ties {
3621
+ color: var(--accent-yellow);
3622
+ }
3623
+
3624
+ .merged-leaderboard .win-rate-cell {
3625
+ font-family: var(--font-mono);
3626
+ color: var(--accent-green);
3627
+ }
3628
+
3629
+ /* ========== Header Navigation ========== */
3630
+ .header-nav {
3631
+ display: flex;
3632
+ align-items: center;
3633
+ gap: 24px;
3634
+ }
3635
+
3636
+ .nav-link {
3637
+ color: var(--text-secondary);
3638
+ text-decoration: none;
3639
+ font-size: 1rem;
3640
+ cursor: pointer;
3641
+ transition: color 0.15s ease;
3642
+ }
3643
+
3644
+ .nav-link:hover {
3645
+ color: var(--text-primary);
3646
+ }
3647
+
3648
+ .nav-link.active {
3649
+ color: var(--accent-blue);
3650
+ }
3651
+
3652
+ .nav-separator {
3653
+ color: var(--border-light);
3654
+ font-size: 0.875rem;
3655
+ user-select: none;
3656
+ }
3657
+
3658
+ .nav-link.nav-external {
3659
+ font-size: 0.875rem;
3660
+ display: flex;
3661
+ align-items: center;
3662
+ gap: 4px;
3663
+ }
3664
+
3665
+ .nav-link.nav-external .external-icon {
3666
+ font-size: 0.75rem;
3667
+ opacity: 0.7;
3668
+ }
3669
+
3670
+ .nav-link.nav-external:hover .external-icon {
3671
+ opacity: 1;
3672
+ }
3673
+
3674
+ /* Header Action Buttons (unified style) */
3675
+ .btn-header-action {
3676
+ display: flex;
3677
+ align-items: center;
3678
+ gap: 6px;
3679
+ padding: 6px 12px;
3680
+ background: var(--bg-tertiary);
3681
+ border: 1px solid var(--border-color);
3682
+ border-radius: var(--radius-sm);
3683
+ color: var(--accent-yellow);
3684
+ font-size: 0.875rem;
3685
+ cursor: pointer;
3686
+ transition: all 0.15s ease;
3687
+ }
3688
+
3689
+ .btn-header-action:hover {
3690
+ background: var(--bg-hover);
3691
+ border-color: var(--accent-yellow);
3692
+ }
3693
+
3694
+ .header-action-icon {
3695
+ font-size: 1rem;
3696
+ }
3697
+
3698
+ .header-action-count {
3699
+ font-size: 0.75rem;
3700
+ font-family: var(--font-mono);
3701
+ background: var(--bg-primary);
3702
+ padding: 2px 6px;
3703
+ border-radius: var(--radius-sm);
3704
+ color: var(--text-secondary);
3705
+ }
3706
+
3707
+ /* Logo clickable */
3708
+ .logo {
3709
+ cursor: pointer;
3710
+ transition: color 0.15s ease;
3711
+ }
3712
+
3713
+ .logo:hover {
3714
+ color: var(--text-primary);
3715
+ }
3716
+
3717
+ /* ========== Full Page Layout (no sidebar) ========== */
3718
+ .full-page {
3719
+ flex: 1;
3720
+ padding: 32px 48px;
3721
+ overflow-y: auto;
3722
+ max-width: 1600px;
3723
+ margin: 0 auto;
3724
+ width: 100%;
3725
+ }
3726
+
3727
+ .page-header {
3728
+ display: flex;
3729
+ align-items: center;
3730
+ justify-content: space-between;
3731
+ margin-bottom: 12px;
3732
+ }
3733
+
3734
+ .page-header h2 {
3735
+ font-size: 1.5rem;
3736
+ font-weight: 600;
3737
+ color: var(--text-primary);
3738
+ margin: 0;
3739
+ }
3740
+
3741
+ .page-description {
3742
+ font-size: 0.9375rem;
3743
+ color: var(--text-secondary);
3744
+ line-height: 1.5;
3745
+ margin-bottom: 24px;
3746
+ }
3747
+
3748
+ /* Gallery Page Container (with sidebar) */
3749
+ .gallery-page-container {
3750
+ display: flex;
3751
+ width: 100%;
3752
+ }
3753
+
3754
+ /* Gallery Controls (top bar in content area) */
3755
+ .gallery-controls {
3756
+ display: flex;
3757
+ align-items: center;
3758
+ gap: 16px;
3759
+ padding: 12px 16px;
3760
+ background: var(--bg-tertiary);
3761
+ border-bottom: 1px solid var(--border-color);
3762
+ flex-wrap: wrap;
3763
+ margin-bottom: 16px;
3764
+ }
3765
+
3766
+ .gallery-controls .selector-group {
3767
+ display: flex;
3768
+ align-items: center;
3769
+ gap: 8px;
3770
+ }
3771
+
3772
+ .gallery-controls .selector-group label {
3773
+ font-size: 0.8125rem;
3774
+ color: var(--text-secondary);
3775
+ }
3776
+
3777
+ .gallery-controls .selector {
3778
+ min-width: 140px;
3779
+ }
3780
+
3781
+ .gallery-controls .view-toggle {
3782
+ margin-left: auto;
3783
+ }
3784
+
3785
+ .gallery-controls .search-box {
3786
+ display: flex;
3787
+ align-items: center;
3788
+ gap: 4px;
3789
+ }
3790
+
3791
+ /* ========== Overview Page Styles ========== */
3792
+ .overview-content {
3793
+ background: var(--bg-secondary);
3794
+ border: 1px solid var(--border-color);
3795
+ border-radius: var(--radius-md);
3796
+ overflow: hidden;
3797
+ }
3798
+
3799
+ .overview-table-container {
3800
+ overflow-x: auto;
3801
+ max-width: 100%;
3802
+ }
3803
+
3804
+ .overview-table {
3805
+ width: 100%;
3806
+ border-collapse: collapse;
3807
+ font-size: 0.875rem;
3808
+ }
3809
+
3810
+ .overview-table th,
3811
+ .overview-table td {
3812
+ padding: 12px 16px;
3813
+ text-align: left;
3814
+ border-bottom: 1px solid var(--border-color);
3815
+ white-space: nowrap;
3816
+ }
3817
+
3818
+ .overview-table th {
3819
+ background: var(--bg-tertiary);
3820
+ color: var(--text-secondary);
3821
+ font-size: 0.75rem;
3822
+ text-transform: uppercase;
3823
+ letter-spacing: 0.05em;
3824
+ font-weight: 600;
3825
+ position: sticky;
3826
+ top: 0;
3827
+ z-index: 10;
3828
+ }
3829
+
3830
+ .overview-table th.sortable {
3831
+ cursor: pointer;
3832
+ user-select: none;
3833
+ transition: background 0.15s ease;
3834
+ }
3835
+
3836
+ .overview-table th.sortable:hover {
3837
+ background: var(--bg-hover);
3838
+ color: var(--text-primary);
3839
+ }
3840
+
3841
+ .overview-table th.sorted-asc::after {
3842
+ content: ' ▲';
3843
+ font-size: 0.625rem;
3844
+ }
3845
+
3846
+ .overview-table th.sorted-desc::after {
3847
+ content: ' ▼';
3848
+ font-size: 0.625rem;
3849
+ }
3850
+
3851
+ .overview-table th.subset-header {
3852
+ cursor: pointer;
3853
+ transition: all 0.15s ease;
3854
+ }
3855
+
3856
+ .overview-table th.subset-header:hover {
3857
+ background: var(--accent-blue);
3858
+ color: #fff;
3859
+ }
3860
+
3861
+ .overview-table th.model-header {
3862
+ position: sticky;
3863
+ left: 0;
3864
+ z-index: 11;
3865
+ background: var(--bg-tertiary);
3866
+ }
3867
+
3868
+ .overview-table td.model-cell {
3869
+ font-family: var(--font-mono);
3870
+ font-weight: 500;
3871
+ position: sticky;
3872
+ left: 0;
3873
+ background: var(--bg-secondary);
3874
+ z-index: 1;
3875
+ cursor: pointer;
3876
+ transition: background 0.15s ease;
3877
+ }
3878
+
3879
+ .overview-table tr:hover td.model-cell {
3880
+ background: var(--bg-tertiary);
3881
+ }
3882
+
3883
+ .overview-table td.model-cell:hover {
3884
+ color: var(--accent-blue);
3885
+ }
3886
+
3887
+ .overview-table td.elo-cell {
3888
+ font-family: var(--font-mono);
3889
+ text-align: center;
3890
+ }
3891
+
3892
+ .overview-table td.elo-cell.no-data {
3893
+ color: var(--text-muted);
3894
+ }
3895
+
3896
+ .overview-table td.avg-elo-cell {
3897
+ font-family: var(--font-mono);
3898
+ font-weight: 600;
3899
+ color: var(--accent-blue);
3900
+ text-align: center;
3901
+ background: var(--bg-tertiary);
3902
+ }
3903
+
3904
+ .overview-table tbody tr {
3905
+ transition: background 0.15s ease;
3906
+ }
3907
+
3908
+ .overview-table tbody tr:hover {
3909
+ background: var(--bg-hover);
3910
+ }
3911
+
3912
+ /* ELO value coloring */
3913
+ .elo-high {
3914
+ color: var(--accent-green);
3915
+ }
3916
+
3917
+ .elo-mid {
3918
+ color: var(--text-primary);
3919
+ }
3920
+
3921
+ .elo-low {
3922
+ color: var(--accent-red);
3923
+ }
3924
+
3925
+ /* Rank badge in overview */
3926
+ .rank-badge {
3927
+ display: inline-block;
3928
+ width: 24px;
3929
+ height: 24px;
3930
+ line-height: 24px;
3931
+ text-align: center;
3932
+ border-radius: 50%;
3933
+ font-size: 0.6875rem;
3934
+ font-weight: 600;
3935
+ margin-right: 8px;
3936
+ }
3937
+
3938
+ .rank-badge.rank-1 {
3939
+ background: rgba(210, 153, 34, 0.2);
3940
+ color: var(--accent-yellow);
3941
+ }
3942
+
3943
+ .rank-badge.rank-2 {
3944
+ background: rgba(192, 192, 192, 0.2);
3945
+ color: #c0c0c0;
3946
+ }
3947
+
3948
+ .rank-badge.rank-3 {
3949
+ background: rgba(205, 127, 50, 0.2);
3950
+ color: #cd7f32;
3951
+ }
3952
+
3953
+ /* Subset info in header */
3954
+ .subset-header-info {
3955
+ display: block;
3956
+ font-size: 0.625rem;
3957
+ font-weight: 400;
3958
+ color: var(--text-muted);
3959
+ text-transform: none;
3960
+ letter-spacing: normal;
3961
+ margin-top: 2px;
3962
+ }
3963
+
3964
+ /* ========== Cross-Subset Modal Styles ========== */
3965
+ #cross-subset-modal .modal-content {
3966
+ padding: 32px;
3967
+ max-width: 800px;
3968
+ width: 90%;
3969
+ }
3970
+
3971
+ .cross-subset-modal-header {
3972
+ margin-bottom: 8px;
3973
+ }
3974
+
3975
+ .cross-subset-modal-header h2 {
3976
+ font-size: 1.25rem;
3977
+ font-weight: 600;
3978
+ color: var(--text-primary);
3979
+ margin: 0;
3980
+ }
3981
+
3982
+ .modal-description {
3983
+ font-size: 0.875rem;
3984
+ color: var(--text-secondary);
3985
+ margin-bottom: 24px;
3986
+ }
3987
+
3988
+ #cross-subset-modal .cross-subset-content {
3989
+ background: var(--bg-tertiary);
3990
+ border: 1px solid var(--border-color);
3991
+ border-radius: var(--radius-md);
3992
+ padding: 24px;
3993
+ margin-bottom: 24px;
3994
+ }
3995
+
3996
+ #cross-subset-modal .cross-subset-info {
3997
+ background: var(--bg-secondary);
3998
+ }
3999
+
4000
+ #cross-subset-modal .cross-subset-results {
4001
+ margin-top: 24px;
4002
+ }
4003
+
4004
+ #cross-subset-modal .cross-subset-results:empty {
4005
+ display: none;
4006
+ margin-top: 0;
4007
+ }
4008
+
4009
+ .cross-subset-selection {
4010
+ margin-bottom: 20px;
4011
+ }
4012
+
4013
+ .cross-subset-selection h4 {
4014
+ font-size: 0.875rem;
4015
+ color: var(--text-secondary);
4016
+ margin-bottom: 12px;
4017
+ }
4018
+
4019
+ .cross-subset-info {
4020
+ background: var(--bg-tertiary);
4021
+ padding: 12px 16px;
4022
+ border-radius: var(--radius-sm);
4023
+ margin-bottom: 16px;
4024
+ }
4025
+
4026
+ .cross-subset-info p {
4027
+ margin: 4px 0;
4028
+ font-size: 0.875rem;
4029
+ color: var(--text-secondary);
4030
+ }
4031
+
4032
+ .cross-subset-info span {
4033
+ font-family: var(--font-mono);
4034
+ color: var(--accent-blue);
4035
+ }
4036
+
4037
+ .cross-subset-options {
4038
+ margin-bottom: 16px;
4039
+ display: flex;
4040
+ align-items: center;
4041
+ gap: 16px;
4042
+ }
4043
+
4044
+ .cross-subset-options label:first-child {
4045
+ color: var(--text-secondary);
4046
+ font-size: 0.875rem;
4047
+ }
4048
+
4049
+ .cross-subset-results {
4050
+ background: var(--bg-tertiary);
4051
+ border: 1px solid var(--border-color);
4052
+ border-radius: var(--radius-md);
4053
+ padding: 24px;
4054
+ }
4055
+
4056
+ .cross-subset-results:empty {
4057
+ display: none;
4058
+ }
4059
+
4060
+ .cross-subset-results h3 {
4061
+ font-size: 0.875rem;
4062
+ color: var(--text-muted);
4063
+ text-transform: uppercase;
4064
+ letter-spacing: 0.05em;
4065
+ margin-bottom: 16px;
4066
+ }
4067
+
4068
+ /* ========== Responsive Adjustments ========== */
4069
+ @media (max-width: 1200px) {
4070
+ .full-page {
4071
+ padding: 24px 32px;
4072
+ }
4073
+ }
4074
+
4075
+ @media (max-width: 768px) {
4076
+ .header-center {
4077
+ flex-wrap: wrap;
4078
+ gap: 8px;
4079
+ }
4080
+
4081
+ .header-nav {
4082
+ order: -1;
4083
+ width: 100%;
4084
+ justify-content: center;
4085
+ }
4086
+
4087
+ .subset-controls {
4088
+ flex-wrap: wrap;
4089
+ justify-content: center;
4090
+ }
4091
+
4092
+ .full-page {
4093
+ padding: 16px;
4094
+ }
4095
+
4096
+ .overview-table {
4097
+ font-size: 0.75rem;
4098
+ }
4099
+
4100
+ .overview-table th,
4101
+ .overview-table td {
4102
+ padding: 8px 10px;
4103
+ }
4104
+ }
genarena/visualize/templates/index.html ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>GenArena Explorer</title>
7
+ <link rel="stylesheet" href="static/style.css">
8
+ </head>
9
+ <body>
10
+ <div id="app">
11
+ <!-- Header -->
12
+ <header class="header">
13
+ <div class="header-left">
14
+ <h1 class="logo" id="logo-link" title="Back to Overview">GenArena</h1>
15
+ <nav class="header-nav">
16
+ <a id="nav-overview" class="nav-link active" title="All Subsets Overview">Overview</a>
17
+ <a id="nav-gallery" class="nav-link" title="Browse Battles">Gallery</a>
18
+ <span class="nav-separator">|</span>
19
+ <a href="#" class="nav-link nav-external" target="_blank" rel="noopener noreferrer" title="Project Page">Project Page <span class="external-icon">↗</span></a>
20
+ <a href="#" class="nav-link nav-external" target="_blank" rel="noopener noreferrer" title="arXiv Paper">arXiv <span class="external-icon">↗</span></a>
21
+ <a href="#" class="nav-link nav-external" target="_blank" rel="noopener noreferrer" title="GitHub Repository">GitHub <span class="external-icon">↗</span></a>
22
+ </nav>
23
+ </div>
24
+ <div class="header-right">
25
+ <button id="favorites-btn" class="btn-header-action" title="View Favorites">
26
+ <span class="header-action-icon">★</span>
27
+ <span id="favorites-count" class="header-action-count">0</span>
28
+ </button>
29
+ </div>
30
+ </header>
31
+
32
+ <!-- Main Content -->
33
+ <div class="main-container">
34
+ <!-- Overview Page (default landing) -->
35
+ <div id="overview-page" class="full-page">
36
+ <div class="page-header">
37
+ <h2>All Subsets Leaderboard</h2>
38
+ <button id="cross-subset-btn" class="btn btn-secondary">Cross-Subset</button>
39
+ </div>
40
+ <p class="page-description">ELO rankings across all evaluation subsets. Click on a subset column header to view details, or click a model row to see its performance.</p>
41
+ <div id="overview-content" class="overview-content">
42
+ <div class="loading">Loading leaderboards...</div>
43
+ </div>
44
+ </div>
45
+
46
+ <!-- Gallery Page (with sidebar) -->
47
+ <div id="gallery-page" class="gallery-page-container" style="display: none;">
48
+ <!-- Sidebar Filters -->
49
+ <aside class="sidebar">
50
+ <div class="filter-section">
51
+ <h3>Filters</h3>
52
+
53
+ <!-- Model filter - only visible in battles view -->
54
+ <div class="filter-group battles-only">
55
+ <label>Models: <span id="model-count">(0 selected)</span></label>
56
+ <div id="model-checkboxes" class="checkbox-group">
57
+ <!-- Populated by JavaScript -->
58
+ </div>
59
+ <div class="checkbox-actions">
60
+ <button id="select-all-models" class="btn btn-small">Select All</button>
61
+ <button id="clear-all-models" class="btn btn-small">Clear All</button>
62
+ </div>
63
+ </div>
64
+
65
+ <div class="filter-group battles-only" id="result-filter-group" style="display: none;">
66
+ <label for="result-filter">Result (single model only):</label>
67
+ <select id="result-filter" class="filter-select">
68
+ <option value="">All results</option>
69
+ <option value="wins">Wins</option>
70
+ <option value="losses">Losses</option>
71
+ <option value="ties">Ties</option>
72
+ </select>
73
+ </div>
74
+
75
+ <div class="filter-group battles-only">
76
+ <label for="consistency-filter">Consistency:</label>
77
+ <select id="consistency-filter" class="filter-select">
78
+ <option value="">All</option>
79
+ <option value="true">Consistent</option>
80
+ <option value="false">Inconsistent</option>
81
+ </select>
82
+ </div>
83
+
84
+ <div class="filter-group" id="prompt-source-filter-group" style="display: none;">
85
+ <label for="prompt-source-filter">Prompt Source:</label>
86
+ <select id="prompt-source-filter" class="filter-select">
87
+ <option value="">All sources</option>
88
+ </select>
89
+ </div>
90
+
91
+ <!-- Model filter for prompts view -->
92
+ <div class="filter-group prompts-only" id="prompts-model-filter-group" style="display: none;">
93
+ <label>Filter Models: <span id="prompts-model-count">(0 selected)</span></label>
94
+ <div id="prompts-model-checkboxes" class="checkbox-group">
95
+ <!-- Populated by JavaScript -->
96
+ </div>
97
+ <div class="checkbox-actions">
98
+ <button id="prompts-select-all-models" class="btn btn-small">Select All</button>
99
+ <button id="prompts-clear-all-models" class="btn btn-small">Clear All</button>
100
+ </div>
101
+ <button id="prompts-apply-model-filter" class="btn btn-primary btn-small" style="width: 100%; margin-top: 8px;">Apply Filter</button>
102
+ <p class="filter-hint">Show only selected models and their battles</p>
103
+ </div>
104
+
105
+ <div class="filter-group" id="image-count-filter-group" style="display: none;">
106
+ <label>Input Images: <span id="image-range-display">1-1</span></label>
107
+ <div class="range-slider-container">
108
+ <input type="range" id="min-images-slider" class="range-slider" min="1" max="10" value="1">
109
+ <input type="range" id="max-images-slider" class="range-slider" min="1" max="10" value="10">
110
+ </div>
111
+ <div class="range-labels">
112
+ <span id="min-images-label">1</span>
113
+ <span id="max-images-label">10</span>
114
+ </div>
115
+ </div>
116
+
117
+ <button id="apply-filters" class="btn btn-primary">Apply Filters</button>
118
+ <button id="clear-filters" class="btn btn-secondary">Clear</button>
119
+ </div>
120
+
121
+ <div class="stats-section">
122
+ <h3>Statistics</h3>
123
+ <div id="stats-panel">
124
+ <p class="placeholder">Select a subset and experiment</p>
125
+ </div>
126
+ </div>
127
+
128
+ <div class="h2h-section" id="h2h-section" style="display: none;">
129
+ <h3>Head-to-Head</h3>
130
+ <div id="h2h-panel">
131
+ <!-- Filled by JavaScript -->
132
+ </div>
133
+ </div>
134
+
135
+ <!-- ELO Leaderboard Section -->
136
+ <div class="elo-section" id="elo-section">
137
+ <div class="elo-header">
138
+ <h3>ELO Leaderboard</h3>
139
+ <button id="view-full-leaderboard" class="btn btn-small btn-link" title="View Full Leaderboard">
140
+ View All →
141
+ </button>
142
+ </div>
143
+ <div id="elo-panel">
144
+ <p class="placeholder">Select a subset to view rankings</p>
145
+ </div>
146
+ <div class="elo-actions">
147
+ <button id="view-matrix" class="btn btn-small btn-secondary" title="View Win Rate Matrix">Matrix</button>
148
+ <button id="view-elo-history" class="btn btn-small btn-secondary" title="View ELO History">History</button>
149
+ <button id="view-elo-by-source" class="btn btn-small btn-secondary" title="View ELO by Source">By Source</button>
150
+ </div>
151
+ </div>
152
+ </aside>
153
+
154
+ <!-- Battle List -->
155
+ <main class="content">
156
+ <!-- Gallery Controls -->
157
+ <div class="gallery-controls">
158
+ <div class="selector-group">
159
+ <label for="subset-select">Subset:</label>
160
+ <select id="subset-select" class="selector">
161
+ <option value="">Select subset...</option>
162
+ </select>
163
+ </div>
164
+ <div class="selector-group">
165
+ <label for="exp-select">Experiment:</label>
166
+ <select id="exp-select" class="selector" disabled>
167
+ <option value="">Select experiment...</option>
168
+ </select>
169
+ </div>
170
+ <div class="view-toggle">
171
+ <button id="view-battles" class="view-btn active" title="View Battles">
172
+ <span class="view-icon">⚔️</span>
173
+ <span>Battles</span>
174
+ </button>
175
+ <button id="view-prompts" class="view-btn" title="View by Prompt">
176
+ <span class="view-icon">📝</span>
177
+ <span>Prompts</span>
178
+ </button>
179
+ </div>
180
+ <div class="search-box">
181
+ <input type="text" id="search-input" class="search-input" placeholder="Search prompts..." title="Search by instruction, task type, or metadata">
182
+ <button id="search-btn" class="btn btn-small search-btn" title="Search">🔍</button>
183
+ <button id="clear-search-btn" class="btn btn-small clear-search-btn" title="Clear search" style="display: none;">✕</button>
184
+ </div>
185
+ </div>
186
+
187
+ <div class="content-header">
188
+ <div id="pagination-info" class="pagination-info"></div>
189
+ <div class="pagination-controls">
190
+ <button id="first-page" class="btn btn-small" disabled>&laquo;</button>
191
+ <button id="prev-page" class="btn btn-small" disabled>&lt;</button>
192
+ <div id="page-numbers" class="page-numbers">
193
+ <!-- Populated by JavaScript -->
194
+ </div>
195
+ <button id="next-page" class="btn btn-small" disabled>&gt;</button>
196
+ <button id="last-page" class="btn btn-small" disabled>&raquo;</button>
197
+ <div class="page-jump">
198
+ <input type="number" id="page-input" class="page-input" min="1" placeholder="Page">
199
+ <button id="page-go" class="btn btn-small">Go</button>
200
+ </div>
201
+ </div>
202
+ </div>
203
+
204
+ <div id="battle-list" class="battle-list">
205
+ <div class="empty-state">
206
+ <p>Select a subset and experiment to view battles</p>
207
+ </div>
208
+ </div>
209
+
210
+ <!-- Prompts View Container -->
211
+ <div id="prompts-list" class="prompts-list" style="display: none;">
212
+ <div class="empty-state">
213
+ <p>Select a subset and experiment to view prompts</p>
214
+ </div>
215
+ </div>
216
+
217
+ <div class="content-footer">
218
+ <div class="pagination-controls">
219
+ <button id="first-page-bottom" class="btn btn-small" disabled>&laquo;</button>
220
+ <button id="prev-page-bottom" class="btn btn-small" disabled>&lt;</button>
221
+ <div id="page-numbers-bottom" class="page-numbers">
222
+ <!-- Populated by JavaScript -->
223
+ </div>
224
+ <button id="next-page-bottom" class="btn btn-small" disabled>&gt;</button>
225
+ <button id="last-page-bottom" class="btn btn-small" disabled>&raquo;</button>
226
+ <div class="page-jump">
227
+ <input type="number" id="page-input-bottom" class="page-input" min="1" placeholder="Page">
228
+ <button id="page-go-bottom" class="btn btn-small">Go</button>
229
+ </div>
230
+ </div>
231
+ </div>
232
+ </main>
233
+ </div> <!-- End of gallery-page -->
234
+ </div>
235
+
236
+ <!-- Detail Modal -->
237
+ <div id="detail-modal" class="modal hidden">
238
+ <div class="modal-backdrop"></div>
239
+ <div class="modal-content">
240
+ <button class="modal-close">&times;</button>
241
+ <div id="detail-content">
242
+ <!-- Filled by JavaScript -->
243
+ </div>
244
+ </div>
245
+ </div>
246
+
247
+ <!-- Favorites Modal -->
248
+ <div id="favorites-modal" class="modal hidden">
249
+ <div class="modal-backdrop"></div>
250
+ <div class="modal-content modal-content-wide">
251
+ <button class="modal-close">&times;</button>
252
+ <div class="favorites-modal-header">
253
+ <h2>Favorite Prompts</h2>
254
+ <button id="clear-all-favorites" class="btn btn-secondary btn-small">Clear All</button>
255
+ </div>
256
+ <div id="favorites-scrollable" class="favorites-scrollable">
257
+ <!-- Favorites Model Filter - horizontal layout -->
258
+ <div class="favorites-model-filter" id="favorites-model-filter-group">
259
+ <label>Filter Models:</label>
260
+ <div id="favorites-model-checkboxes" class="checkbox-group-horizontal">
261
+ <!-- Populated by JavaScript -->
262
+ </div>
263
+ <div class="filter-controls-row">
264
+ <div class="checkbox-actions-inline">
265
+ <button id="favorites-select-all-models" class="btn btn-small">Select All</button>
266
+ <button id="favorites-clear-all-models" class="btn btn-small">Clear All</button>
267
+ <button id="favorites-apply-model-filter" class="btn btn-primary btn-small">Apply Filter</button>
268
+ </div>
269
+ <div class="stats-scope-toggle">
270
+ <label class="toggle-label">
271
+ <input type="checkbox" id="favorites-stats-scope-all">
272
+ <span class="toggle-text">Win rate includes all opponents</span>
273
+ </label>
274
+ </div>
275
+ </div>
276
+ </div>
277
+ <div id="favorites-content">
278
+ <!-- Filled by JavaScript -->
279
+ </div>
280
+ </div>
281
+ </div>
282
+ </div>
283
+
284
+ <!-- Image Lightbox -->
285
+ <div id="lightbox" class="lightbox">
286
+ <button class="lightbox-close">&times;</button>
287
+ <img id="lightbox-img" src="" alt="Enlarged image">
288
+ <div id="lightbox-label" class="lightbox-label"></div>
289
+ </div>
290
+
291
+ <!-- ELO Leaderboard Modal -->
292
+ <div id="leaderboard-modal" class="modal hidden">
293
+ <div class="modal-backdrop"></div>
294
+ <div class="modal-content modal-content-wide">
295
+ <button class="modal-close">&times;</button>
296
+ <div class="leaderboard-modal-header">
297
+ <h2>ELO Leaderboard</h2>
298
+ <span id="leaderboard-subset-name" class="subset-badge"></span>
299
+ </div>
300
+ <div id="leaderboard-content">
301
+ <!-- Filled by JavaScript -->
302
+ </div>
303
+ </div>
304
+ </div>
305
+
306
+ <!-- Model Stats Modal -->
307
+ <div id="model-stats-modal" class="modal hidden">
308
+ <div class="modal-backdrop"></div>
309
+ <div class="modal-content modal-content-wide">
310
+ <button class="modal-close">&times;</button>
311
+ <div id="model-stats-content">
312
+ <!-- Filled by JavaScript -->
313
+ </div>
314
+ </div>
315
+ </div>
316
+
317
+ <!-- Win Rate Matrix Modal -->
318
+ <div id="matrix-modal" class="modal hidden">
319
+ <div class="modal-backdrop"></div>
320
+ <div class="modal-content modal-content-wide">
321
+ <button class="modal-close">&times;</button>
322
+ <div class="matrix-modal-header">
323
+ <h2>Win Rate Matrix</h2>
324
+ <span id="matrix-subset-name" class="subset-badge"></span>
325
+ </div>
326
+ <div id="matrix-content" class="matrix-content">
327
+ <!-- Filled by JavaScript -->
328
+ </div>
329
+ </div>
330
+ </div>
331
+
332
+ <!-- ELO History Modal -->
333
+ <div id="elo-history-modal" class="modal hidden">
334
+ <div class="modal-backdrop"></div>
335
+ <div class="modal-content modal-content-wide">
336
+ <button class="modal-close">&times;</button>
337
+ <div class="elo-history-header">
338
+ <h2>ELO History</h2>
339
+ <div class="elo-history-controls">
340
+ <label for="elo-history-granularity">Group by:</label>
341
+ <select id="elo-history-granularity" class="selector">
342
+ <option value="experiment" selected>Experiment</option>
343
+ <option value="day">Day</option>
344
+ <option value="week">Week</option>
345
+ </select>
346
+ </div>
347
+ </div>
348
+ <div id="elo-history-content" class="elo-history-content">
349
+ <!-- Filled by JavaScript - SVG chart -->
350
+ </div>
351
+ <div id="elo-history-legend" class="elo-history-legend">
352
+ <!-- Filled by JavaScript -->
353
+ </div>
354
+ </div>
355
+ </div>
356
+
357
+ <!-- ELO by Source Modal -->
358
+ <div id="elo-by-source-modal" class="modal hidden">
359
+ <div class="modal-backdrop"></div>
360
+ <div class="modal-content modal-content-wide">
361
+ <button class="modal-close">&times;</button>
362
+ <div class="elo-by-source-header">
363
+ <h2>ELO Rankings by Prompt Source</h2>
364
+ <span id="elo-by-source-subset-name" class="subset-badge"></span>
365
+ </div>
366
+ <div id="elo-by-source-content" class="elo-by-source-content">
367
+ <!-- Filled by JavaScript -->
368
+ </div>
369
+ </div>
370
+ </div>
371
+
372
+ <!-- Cross-Subset Modal -->
373
+ <div id="cross-subset-modal" class="modal hidden">
374
+ <div class="modal-backdrop"></div>
375
+ <div class="modal-content modal-content-wide">
376
+ <button class="modal-close">&times;</button>
377
+ <div class="cross-subset-modal-header">
378
+ <h2>Cross-Subset ELO Analysis</h2>
379
+ </div>
380
+ <p class="modal-description">Merge battles from multiple subsets to compute combined ELO rankings.</p>
381
+ <div class="cross-subset-content">
382
+ <div class="cross-subset-selection">
383
+ <h4>Select subsets to merge:</h4>
384
+ <div id="cross-subset-checkboxes" class="checkbox-group">
385
+ <!-- Populated by JavaScript -->
386
+ </div>
387
+ <div class="checkbox-actions">
388
+ <button id="cross-subset-select-all" class="btn btn-small">Select All</button>
389
+ <button id="cross-subset-clear-all" class="btn btn-small">Clear All</button>
390
+ </div>
391
+ </div>
392
+ <div class="cross-subset-info">
393
+ <p>Common models (in all selected): <span id="common-model-count">-</span></p>
394
+ <p>Union models (in any selected): <span id="union-model-count">-</span></p>
395
+ <p>Total battles: <span id="total-battles-count">-</span></p>
396
+ </div>
397
+ <div class="cross-subset-options">
398
+ <label>Model scope:</label>
399
+ <label class="radio-label"><input type="radio" name="model-scope" value="all" checked> All models</label>
400
+ <label class="radio-label"><input type="radio" name="model-scope" value="common"> Common only</label>
401
+ </div>
402
+ <button id="calculate-merged-elo" class="btn btn-primary">Calculate Merged ELO</button>
403
+ </div>
404
+ <div id="cross-subset-results" class="cross-subset-results">
405
+ <!-- Filled by JavaScript -->
406
+ </div>
407
+ </div>
408
+ </div>
409
+ </div>
410
+
411
+ <script src="static/app.js"></script>
412
+ </body>
413
+ </html>