"""Thin client for the private Modal evaluation backend.
The actual evaluation pipeline (embedding extraction, CKA scoring) lives in
a private repository and is deployed as a Modal app. This module calls the
deployed functions by name — no backend code is imported here.
Public configs (blue model registry) can still be controlled via HF Space env
vars for convenience. Secret configs (red team registry, blue heldout images)
are loaded server-side from the Modal volume — never sent from here.
The backend must be deployed first:
modal deploy scripts/modal_backend.py # from the private eval-backend repo
"""
from __future__ import annotations
import json
import os
from pathlib import Path
from typing import Any, Iterable
from src.hackathon.validation import (
BLUE_MODEL_REGISTRY_ENV,
MODEL_REGISTRY_ENV,
)
MODAL_ENABLE_ENV = "HACKATHON_MODAL_ENABLE"
MODAL_APP_ENV = "HACKATHON_MODAL_APP"
MODAL_BATCH_SIZE_ENV = "HACKATHON_MODAL_BATCH_SIZE"
DEFAULT_MODAL_APP = "iclr2026-eval"
DEFAULT_BATCH_SIZE = 64
def _is_truthy(value: str | None) -> bool:
if value is None:
return False
return value.strip().lower() in {"1", "true", "yes", "y", "on"}
def is_modal_enabled() -> bool:
    """Return True when the HACKATHON_MODAL_ENABLE env var is set truthy."""
    flag = os.environ.get(MODAL_ENABLE_ENV)
    return _is_truthy(flag)
def _get_batch_size() -> int:
    """Return the batch size from HACKATHON_MODAL_BATCH_SIZE, or the default.

    An unset or blank env var falls back to DEFAULT_BATCH_SIZE; a non-numeric
    value raises ValueError via int().
    """
    configured = os.environ.get(MODAL_BATCH_SIZE_ENV, "").strip()
    return int(configured) if configured else DEFAULT_BATCH_SIZE
def _get_modal_function(function_name: str) -> Any:
    """Look up *function_name* on the deployed Modal app by name.

    The app name comes from HACKATHON_MODAL_APP when set (non-blank),
    otherwise DEFAULT_MODAL_APP. Imported lazily so the module loads even
    when the `modal` package is absent.
    """
    import modal

    configured = os.environ.get(MODAL_APP_ENV, "").strip()
    app_name = configured if configured else DEFAULT_MODAL_APP
    return modal.Function.from_name(app_name, function_name)
def _load_json_file(path: str) -> Any:
"""Load a JSON or JSONL file from a local path."""
p = Path(path)
if p.suffix == ".jsonl":
lines = p.read_text().splitlines()
return [json.loads(line) for line in lines if line.strip()]
return json.loads(p.read_text())
def _load_blue_model_registry() -> list[dict[str, Any]] | None:
    """Load the blue model registry from an env-configured path, if any.

    Checks HACKATHON_BLUE_MODEL_REGISTRY first, then the legacy
    HACKATHON_MODEL_REGISTRY. When neither is set (non-blank), returns
    None and the backend falls back to its copy on the Modal volume.
    """
    registry_path = (
        os.environ.get(BLUE_MODEL_REGISTRY_ENV, "").strip()
        or os.environ.get(MODEL_REGISTRY_ENV, "").strip()
    )
    if not registry_path:
        return None
    loaded = _load_json_file(registry_path)
    if isinstance(loaded, dict):
        # A wrapped registry stores the list under "models"; otherwise
        # pass the mapping through unchanged.
        return loaded.get("models", loaded)
    return loaded
def score_blue_with_pairwise(
    model_names: Iterable[str],
    *,
    submission_id: str | None = None,
    submitter: str | None = None,
    hf_link: str | None = None,
) -> tuple[float, list[dict[str, Any]]]:
    """Score a blue team submission via the deployed Modal backend.

    If HACKATHON_BLUE_MODEL_REGISTRY (or HACKATHON_MODEL_REGISTRY) is set,
    the registry is sent to the backend. Otherwise the backend loads its
    own copy from the Modal volume.

    Blue heldout images are always loaded server-side (secret).

    When submission_id is provided, the backend saves the result to the
    Modal volume for crash recovery.

    Returns:
        A ``(avg_cka, pairwise)`` tuple; both default to 0.0 / [] when the
        backend response omits them.
    """
    registry = _load_blue_model_registry()
    backend_fn = _get_modal_function("score_blue_submission")
    response = backend_fn.remote(
        model_names=list(model_names),
        model_registry=registry,
        batch_size=_get_batch_size(),
        submission_id=submission_id,
        submitter=submitter,
        hf_link=hf_link,
    )
    pairwise = list(response.get("pairwise", []))
    return float(response.get("avg_cka", 0.0)), pairwise
def score_red_with_pairwise(
    selected_stimuli: Iterable[dict[str, str] | str],
    *,
    submission_id: str | None = None,
    submitter: str | None = None,
    hf_link: str | None = None,
) -> tuple[float, list[dict[str, Any]]]:
    """Score a red team submission via the deployed Modal backend.

    The red team model registry is always loaded server-side from the
    Modal volume (secret — never sent from the public Space).

    When submission_id is provided, the backend saves the result to the
    Modal volume for crash recovery.

    Args:
        selected_stimuli: Either dicts with "dataset_name" /
            "image_identifier" keys, or "dataset::identifier" key strings.

    Raises:
        ValueError: If a string stimulus lacks the "::" separator.
    """
    normalized: list[dict[str, str]] = []
    for entry in selected_stimuli:
        if not isinstance(entry, str):
            # Copy dict entries so callers' objects are never mutated/shared.
            normalized.append(dict(entry))
            continue
        dataset_name, sep, image_identifier = entry.partition("::")
        if not sep:
            raise ValueError(f"Invalid stimulus key format: {entry}")
        normalized.append(
            {"dataset_name": dataset_name, "image_identifier": image_identifier}
        )
    backend_fn = _get_modal_function("score_red_submission")
    response = backend_fn.remote(
        selected_stimuli=normalized,
        batch_size=_get_batch_size(),
        submission_id=submission_id,
        submitter=submitter,
        hf_link=hf_link,
    )
    pairwise = list(response.get("pairwise", []))
    return float(response.get("score", 0.0)), pairwise
def fetch_volume_submissions(team: str | None = None) -> list[dict[str, Any]]:
    """Fetch submissions saved on the Modal volume.

    Used to sync submissions after a Space restart.

    Args:
        team: Optional team filter forwarded to the backend; None fetches all.
    """
    list_fn = _get_modal_function("list_submissions")
    return list_fn.remote(team=team)
def push_submissions_to_volume(submissions: list[dict[str, Any]]) -> dict[str, int]:
    """Push local submissions to the Modal volume.

    Used to backfill the volume after a Modal crash or volume wipe.

    Returns:
        {"added": N, "skipped": M} as reported by the backend.
    """
    backfill_fn = _get_modal_function("backfill_submissions")
    return backfill_fn.remote(submissions=submissions)
|