"""Thin client for the private Modal evaluation backend. The actual evaluation pipeline (embedding extraction, CKA scoring) lives in a private repository and is deployed as a Modal app. This module calls the deployed functions by name — no backend code is imported here. Public configs (blue model registry) can still be controlled via HF Space env vars for convenience. Secret configs (red team registry, blue heldout images) are loaded server-side from the Modal volume — never sent from here. The backend must be deployed first: modal deploy scripts/modal_backend.py # from the private eval-backend repo """ from __future__ import annotations import json import os from pathlib import Path from typing import Any, Iterable from src.hackathon.validation import ( BLUE_MODEL_REGISTRY_ENV, MODEL_REGISTRY_ENV, ) MODAL_ENABLE_ENV = "HACKATHON_MODAL_ENABLE" MODAL_APP_ENV = "HACKATHON_MODAL_APP" MODAL_BATCH_SIZE_ENV = "HACKATHON_MODAL_BATCH_SIZE" DEFAULT_MODAL_APP = "iclr2026-eval" DEFAULT_BATCH_SIZE = 64 def _is_truthy(value: str | None) -> bool: if value is None: return False return value.strip().lower() in {"1", "true", "yes", "y", "on"} def is_modal_enabled() -> bool: return _is_truthy(os.environ.get(MODAL_ENABLE_ENV)) def _get_batch_size() -> int: raw = os.environ.get(MODAL_BATCH_SIZE_ENV, "").strip() if raw: return int(raw) return DEFAULT_BATCH_SIZE def _get_modal_function(function_name: str) -> Any: import modal app_name = os.environ.get(MODAL_APP_ENV, "").strip() or DEFAULT_MODAL_APP return modal.Function.from_name(app_name, function_name) def _load_json_file(path: str) -> Any: """Load a JSON or JSONL file from a local path.""" p = Path(path) if p.suffix == ".jsonl": lines = p.read_text().splitlines() return [json.loads(line) for line in lines if line.strip()] return json.loads(p.read_text()) def _load_blue_model_registry() -> list[dict[str, Any]] | None: """Load blue model registry from env var if set, else return None. When None is returned the backend loads its copy from the Modal volume. """ path = os.environ.get(BLUE_MODEL_REGISTRY_ENV, "").strip() if not path: path = os.environ.get(MODEL_REGISTRY_ENV, "").strip() if not path: return None data = _load_json_file(path) if isinstance(data, dict): data = data.get("models", data) return data def score_blue_with_pairwise( model_names: Iterable[str], *, submission_id: str | None = None, submitter: str | None = None, hf_link: str | None = None, ) -> tuple[float, list[dict[str, Any]]]: """Score a blue team submission via the deployed Modal backend. If HACKATHON_BLUE_MODEL_REGISTRY (or HACKATHON_MODEL_REGISTRY) is set, the registry is sent to the backend. Otherwise the backend loads its own copy from the Modal volume. Blue heldout images are always loaded server-side (secret). When submission_id is provided, the backend saves the result to the Modal volume for crash recovery. """ model_registry = _load_blue_model_registry() fn = _get_modal_function("score_blue_submission") result = fn.remote( model_names=list(model_names), model_registry=model_registry, batch_size=_get_batch_size(), submission_id=submission_id, submitter=submitter, hf_link=hf_link, ) avg_cka = float(result.get("avg_cka", 0.0)) return avg_cka, list(result.get("pairwise", [])) def score_red_with_pairwise( selected_stimuli: Iterable[dict[str, str] | str], *, submission_id: str | None = None, submitter: str | None = None, hf_link: str | None = None, ) -> tuple[float, list[dict[str, Any]]]: """Score a red team submission via the deployed Modal backend. The red team model registry is always loaded server-side from the Modal volume (secret — never sent from the public Space). When submission_id is provided, the backend saves the result to the Modal volume for crash recovery. """ stimuli_list: list[dict[str, str]] = [] for item in selected_stimuli: if isinstance(item, str): parts = item.split("::", 1) if len(parts) == 2: stimuli_list.append({"dataset_name": parts[0], "image_identifier": parts[1]}) else: raise ValueError(f"Invalid stimulus key format: {item}") else: stimuli_list.append(dict(item)) fn = _get_modal_function("score_red_submission") result = fn.remote( selected_stimuli=stimuli_list, batch_size=_get_batch_size(), submission_id=submission_id, submitter=submitter, hf_link=hf_link, ) score = float(result.get("score", 0.0)) return score, list(result.get("pairwise", [])) def fetch_volume_submissions(team: str | None = None) -> list[dict[str, Any]]: """Fetch submissions saved on the Modal volume. Used to sync submissions after a Space restart. """ fn = _get_modal_function("list_submissions") return fn.remote(team=team) def push_submissions_to_volume(submissions: list[dict[str, Any]]) -> dict[str, int]: """Push local submissions to the Modal volume. Used to backfill the volume after a Modal crash or volume wipe. Returns {"added": N, "skipped": M}. """ fn = _get_modal_function("backfill_submissions") return fn.remote(submissions=submissions)