| """Thin client for the private Modal evaluation backend. |
| |
| The actual evaluation pipeline (embedding extraction, CKA scoring) lives in |
| a private repository and is deployed as a Modal app. This module calls the |
| deployed functions by name — no backend code is imported here. |
| |
| Public configs (blue model registry) can still be controlled via HF Space env |
| vars for convenience. Secret configs (red team registry, blue heldout images) |
| are loaded server-side from the Modal volume — never sent from here. |
| |
| The backend must be deployed first: |
| modal deploy scripts/modal_backend.py # from the private eval-backend repo |
| """ |
| from __future__ import annotations |
|
|
| import json |
| import os |
| from pathlib import Path |
| from typing import Any, Iterable |
|
|
| from src.hackathon.validation import ( |
| BLUE_MODEL_REGISTRY_ENV, |
| MODEL_REGISTRY_ENV, |
| ) |
|
|
|
|
# Environment variable names read by this client.
MODAL_ENABLE_ENV = "HACKATHON_MODAL_ENABLE"  # truthy value enables the Modal backend
MODAL_APP_ENV = "HACKATHON_MODAL_APP"  # overrides the deployed Modal app name
MODAL_BATCH_SIZE_ENV = "HACKATHON_MODAL_BATCH_SIZE"  # overrides the batch size sent to the backend
DEFAULT_MODAL_APP = "iclr2026-eval"  # app name used when MODAL_APP_ENV is unset/blank
DEFAULT_BATCH_SIZE = 64  # batch size used when MODAL_BATCH_SIZE_ENV is unset/blank
|
|
|
|
| def _is_truthy(value: str | None) -> bool: |
| if value is None: |
| return False |
| return value.strip().lower() in {"1", "true", "yes", "y", "on"} |
|
|
|
|
def is_modal_enabled() -> bool:
    """Whether the Modal backend is switched on via the enable env var."""
    flag = os.environ.get(MODAL_ENABLE_ENV)
    return _is_truthy(flag)
|
|
|
|
def _get_batch_size() -> int:
    """Return the batch size to send to the backend.

    Reads HACKATHON_MODAL_BATCH_SIZE; falls back to DEFAULT_BATCH_SIZE when
    the variable is unset or blank.

    Raises:
        ValueError: if the variable is set but is not a positive integer —
            fail loudly here rather than sending a nonsensical batch size
            (0 or negative) to the remote evaluation pipeline.
    """
    raw = os.environ.get(MODAL_BATCH_SIZE_ENV, "").strip()
    if not raw:
        return DEFAULT_BATCH_SIZE
    try:
        batch_size = int(raw)
    except ValueError as err:
        raise ValueError(
            f"{MODAL_BATCH_SIZE_ENV} must be an integer, got {raw!r}"
        ) from err
    if batch_size <= 0:
        raise ValueError(
            f"{MODAL_BATCH_SIZE_ENV} must be positive, got {batch_size}"
        )
    return batch_size
|
|
|
|
def _get_modal_function(function_name: str) -> Any:
    """Look up *function_name* on the deployed Modal app.

    The app name comes from HACKATHON_MODAL_APP when set (non-blank),
    otherwise DEFAULT_MODAL_APP. ``modal`` is imported lazily so this
    module can be imported even when the dependency is absent.
    """
    import modal

    configured = os.environ.get(MODAL_APP_ENV, "").strip()
    app_name = configured if configured else DEFAULT_MODAL_APP
    return modal.Function.from_name(app_name, function_name)
|
|
|
|
| def _load_json_file(path: str) -> Any: |
| """Load a JSON or JSONL file from a local path.""" |
| p = Path(path) |
| if p.suffix == ".jsonl": |
| lines = p.read_text().splitlines() |
| return [json.loads(line) for line in lines if line.strip()] |
| return json.loads(p.read_text()) |
|
|
|
|
def _load_blue_model_registry() -> list[dict[str, Any]] | None:
    """Load the blue model registry from an env-configured path, if any.

    Checks the primary registry env var first, then the legacy one.
    Returns None when neither is set — the backend then loads its own
    copy from the Modal volume.
    """
    path = (
        os.environ.get(BLUE_MODEL_REGISTRY_ENV, "").strip()
        or os.environ.get(MODEL_REGISTRY_ENV, "").strip()
    )
    if not path:
        return None

    data = _load_json_file(path)
    # Registries may be a bare list or wrapped as {"models": [...]}.
    if isinstance(data, dict):
        data = data.get("models", data)
    return data
|
|
|
|
def score_blue_with_pairwise(
    model_names: Iterable[str],
    *,
    submission_id: str | None = None,
    submitter: str | None = None,
    hf_link: str | None = None,
) -> tuple[float, list[dict[str, Any]]]:
    """Score a blue team submission via the deployed Modal backend.

    The registry is sent along only when HACKATHON_BLUE_MODEL_REGISTRY
    (or HACKATHON_MODEL_REGISTRY) points at a local file; otherwise the
    backend uses its own copy from the Modal volume. Blue heldout images
    are always loaded server-side (secret). Providing submission_id makes
    the backend persist the result to the volume for crash recovery.

    Returns:
        (avg_cka, pairwise) where pairwise is a list of per-pair records.
    """
    registry = _load_blue_model_registry()
    remote_fn = _get_modal_function("score_blue_submission")
    result = remote_fn.remote(
        model_names=list(model_names),
        model_registry=registry,
        batch_size=_get_batch_size(),
        submission_id=submission_id,
        submitter=submitter,
        hf_link=hf_link,
    )
    pairwise = list(result.get("pairwise", []))
    return float(result.get("avg_cka", 0.0)), pairwise
|
|
|
|
def score_red_with_pairwise(
    selected_stimuli: Iterable[dict[str, str] | str],
    *,
    submission_id: str | None = None,
    submitter: str | None = None,
    hf_link: str | None = None,
) -> tuple[float, list[dict[str, Any]]]:
    """Score a red team submission via the deployed Modal backend.

    The red team model registry always lives server-side on the Modal
    volume (secret — never sent from the public Space). Providing
    submission_id makes the backend persist the result to the volume
    for crash recovery.

    Returns:
        (score, pairwise) where pairwise is a list of per-pair records.

    Raises:
        ValueError: if a string stimulus is not a "dataset::identifier" key.
    """

    def _normalize(item: dict[str, str] | str) -> dict[str, str]:
        # Dict stimuli pass through; strings are "dataset::identifier" keys.
        if not isinstance(item, str):
            return dict(item)
        dataset, sep, identifier = item.partition("::")
        if not sep:
            raise ValueError(f"Invalid stimulus key format: {item}")
        return {"dataset_name": dataset, "image_identifier": identifier}

    stimuli_list = [_normalize(item) for item in selected_stimuli]

    remote_fn = _get_modal_function("score_red_submission")
    result = remote_fn.remote(
        selected_stimuli=stimuli_list,
        batch_size=_get_batch_size(),
        submission_id=submission_id,
        submitter=submitter,
        hf_link=hf_link,
    )
    return float(result.get("score", 0.0)), list(result.get("pairwise", []))
|
|
|
|
def fetch_volume_submissions(team: str | None = None) -> list[dict[str, Any]]:
    """Fetch submissions saved on the Modal volume, optionally filtered by team.

    Used to sync submissions after a Space restart.
    """
    list_fn = _get_modal_function("list_submissions")
    return list_fn.remote(team=team)
|
|
|
|
def push_submissions_to_volume(submissions: list[dict[str, Any]]) -> dict[str, int]:
    """Push local submissions to the Modal volume.

    Used to backfill the volume after a Modal crash or volume wipe.
    Returns {"added": N, "skipped": M}.
    """
    backfill_fn = _get_modal_function("backfill_submissions")
    return backfill_fn.remote(submissions=submissions)
|
|