"""Thin client for the private Modal evaluation backend.
The actual evaluation pipeline (embedding extraction, CKA scoring) lives in
a private repository and is deployed as a Modal app. This module calls the
deployed functions by name — no backend code is imported here.
Public configs (blue model registry) can still be controlled via HF Space env
vars for convenience. Secret configs (red team registry, blue heldout images)
are loaded server-side from the Modal volume — never sent from here.
The backend must be deployed first:
modal deploy scripts/modal_backend.py # from the private eval-backend repo
"""
from __future__ import annotations
import json
import os
from pathlib import Path
from typing import Any, Iterable
from src.hackathon.validation import (
BLUE_MODEL_REGISTRY_ENV,
MODEL_REGISTRY_ENV,
)
MODAL_ENABLE_ENV = "HACKATHON_MODAL_ENABLE"
MODAL_APP_ENV = "HACKATHON_MODAL_APP"
MODAL_BATCH_SIZE_ENV = "HACKATHON_MODAL_BATCH_SIZE"
DEFAULT_MODAL_APP = "iclr2026-eval"
DEFAULT_BATCH_SIZE = 64
def _is_truthy(value: str | None) -> bool:
if value is None:
return False
return value.strip().lower() in {"1", "true", "yes", "y", "on"}
def is_modal_enabled() -> bool:
    """Return True when the HACKATHON_MODAL_ENABLE env var is set truthy."""
    flag = os.environ.get(MODAL_ENABLE_ENV)
    return _is_truthy(flag)
def _get_batch_size() -> int:
    """Return the batch size from HACKATHON_MODAL_BATCH_SIZE, or the default.

    An unset or blank env var falls back to DEFAULT_BATCH_SIZE; a non-numeric
    value raises ValueError via int().
    """
    configured = os.environ.get(MODAL_BATCH_SIZE_ENV, "").strip()
    return int(configured) if configured else DEFAULT_BATCH_SIZE
def _get_modal_function(function_name: str) -> Any:
    """Look up *function_name* on the deployed Modal app by name.

    The app name comes from HACKATHON_MODAL_APP when set (non-blank),
    otherwise DEFAULT_MODAL_APP. Imported lazily so the module loads even
    when the `modal` package is absent.
    """
    import modal

    configured = os.environ.get(MODAL_APP_ENV, "").strip()
    app_name = configured if configured else DEFAULT_MODAL_APP
    return modal.Function.from_name(app_name, function_name)
def _load_json_file(path: str) -> Any:
"""Load a JSON or JSONL file from a local path."""
p = Path(path)
if p.suffix == ".jsonl":
lines = p.read_text().splitlines()
return [json.loads(line) for line in lines if line.strip()]
return json.loads(p.read_text())
def _load_blue_model_registry() -> list[dict[str, Any]] | None:
    """Load the blue model registry from an env-configured path, if any.

    Checks HACKATHON_BLUE_MODEL_REGISTRY first, then the legacy
    HACKATHON_MODEL_REGISTRY. When neither is set (non-blank), returns
    None and the backend falls back to its copy on the Modal volume.
    """
    registry_path = (
        os.environ.get(BLUE_MODEL_REGISTRY_ENV, "").strip()
        or os.environ.get(MODEL_REGISTRY_ENV, "").strip()
    )
    if not registry_path:
        return None
    loaded = _load_json_file(registry_path)
    if isinstance(loaded, dict):
        # A wrapped registry stores the list under "models"; otherwise
        # pass the mapping through unchanged.
        return loaded.get("models", loaded)
    return loaded
def score_blue_with_pairwise(
    model_names: Iterable[str],
    *,
    submission_id: str | None = None,
    submitter: str | None = None,
    hf_link: str | None = None,
) -> tuple[float, list[dict[str, Any]]]:
    """Score a blue team submission via the deployed Modal backend.

    If HACKATHON_BLUE_MODEL_REGISTRY (or HACKATHON_MODEL_REGISTRY) is set,
    the registry is sent to the backend. Otherwise the backend loads its
    own copy from the Modal volume.

    Blue heldout images are always loaded server-side (secret).

    When submission_id is provided, the backend saves the result to the
    Modal volume for crash recovery.

    Returns:
        A ``(avg_cka, pairwise)`` tuple; both default to 0.0 / [] when the
        backend response omits them.
    """
    registry = _load_blue_model_registry()
    backend_fn = _get_modal_function("score_blue_submission")
    response = backend_fn.remote(
        model_names=list(model_names),
        model_registry=registry,
        batch_size=_get_batch_size(),
        submission_id=submission_id,
        submitter=submitter,
        hf_link=hf_link,
    )
    pairwise = list(response.get("pairwise", []))
    return float(response.get("avg_cka", 0.0)), pairwise
def score_red_with_pairwise(
    selected_stimuli: Iterable[dict[str, str] | str],
    *,
    submission_id: str | None = None,
    submitter: str | None = None,
    hf_link: str | None = None,
) -> tuple[float, list[dict[str, Any]]]:
    """Score a red team submission via the deployed Modal backend.

    The red team model registry is always loaded server-side from the
    Modal volume (secret — never sent from the public Space).

    When submission_id is provided, the backend saves the result to the
    Modal volume for crash recovery.

    Args:
        selected_stimuli: Either dicts with "dataset_name" /
            "image_identifier" keys, or "dataset::identifier" key strings.

    Raises:
        ValueError: If a string stimulus lacks the "::" separator.
    """
    normalized: list[dict[str, str]] = []
    for entry in selected_stimuli:
        if not isinstance(entry, str):
            # Copy dict entries so callers' objects are never mutated/shared.
            normalized.append(dict(entry))
            continue
        dataset_name, sep, image_identifier = entry.partition("::")
        if not sep:
            raise ValueError(f"Invalid stimulus key format: {entry}")
        normalized.append(
            {"dataset_name": dataset_name, "image_identifier": image_identifier}
        )
    backend_fn = _get_modal_function("score_red_submission")
    response = backend_fn.remote(
        selected_stimuli=normalized,
        batch_size=_get_batch_size(),
        submission_id=submission_id,
        submitter=submitter,
        hf_link=hf_link,
    )
    pairwise = list(response.get("pairwise", []))
    return float(response.get("score", 0.0)), pairwise
def fetch_volume_submissions(team: str | None = None) -> list[dict[str, Any]]:
    """Fetch submissions saved on the Modal volume.

    Used to sync submissions after a Space restart.

    Args:
        team: Optional team filter forwarded to the backend; None fetches all.
    """
    list_fn = _get_modal_function("list_submissions")
    return list_fn.remote(team=team)
def push_submissions_to_volume(submissions: list[dict[str, Any]]) -> dict[str, int]:
    """Push local submissions to the Modal volume.

    Used to backfill the volume after a Modal crash or volume wipe.

    Returns:
        {"added": N, "skipped": M} as reported by the backend.
    """
    backfill_fn = _get_modal_function("backfill_submissions")
    return backfill_fn.remote(submissions=submissions)
|