File size: 5,514 Bytes
d6c8a4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebf9b32
d6c8a4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebf9b32
d6c8a4f
 
 
 
 
 
 
 
 
 
ebf9b32
d6c8a4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebf9b32
d6c8a4f
 
 
 
 
 
 
 
 
 
 
 
159a782
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
"""Thin client for the private Modal evaluation backend.

The actual evaluation pipeline (embedding extraction, CKA scoring) lives in
a private repository and is deployed as a Modal app.  This module calls the
deployed functions by name — no backend code is imported here.

Public configs (blue model registry) can still be controlled via HF Space env
vars for convenience.  Secret configs (red team registry, blue heldout images)
are loaded server-side from the Modal volume — never sent from here.

The backend must be deployed first:
    modal deploy scripts/modal_backend.py   # from the private eval-backend repo
"""
from __future__ import annotations

import json
import os
from pathlib import Path
from typing import Any, Iterable

from src.hackathon.validation import (
    BLUE_MODEL_REGISTRY_ENV,
    MODEL_REGISTRY_ENV,
)


MODAL_ENABLE_ENV = "HACKATHON_MODAL_ENABLE"
MODAL_APP_ENV = "HACKATHON_MODAL_APP"
MODAL_BATCH_SIZE_ENV = "HACKATHON_MODAL_BATCH_SIZE"
DEFAULT_MODAL_APP = "iclr2026-eval"
DEFAULT_BATCH_SIZE = 64


def _is_truthy(value: str | None) -> bool:
    if value is None:
        return False
    return value.strip().lower() in {"1", "true", "yes", "y", "on"}


def is_modal_enabled() -> bool:
    """Return True when the Modal backend is switched on via the enable env var."""
    flag = os.environ.get(MODAL_ENABLE_ENV)
    return _is_truthy(flag)


def _get_batch_size() -> int:
    """Batch size for backend embedding extraction: env override or the default."""
    configured = os.environ.get(MODAL_BATCH_SIZE_ENV, "").strip()
    # An unset or blank env var falls back to DEFAULT_BATCH_SIZE.
    return int(configured) if configured else DEFAULT_BATCH_SIZE


def _get_modal_function(function_name: str) -> Any:
    """Look up *function_name* on the deployed Modal app.

    The app name comes from the HACKATHON_MODAL_APP env var when set,
    otherwise the default app name is used.  `modal` is imported lazily so
    the module can be imported in environments without the dependency.
    """
    import modal

    configured_app = os.environ.get(MODAL_APP_ENV, "").strip()
    app_name = configured_app or DEFAULT_MODAL_APP
    return modal.Function.from_name(app_name, function_name)


def _load_json_file(path: str) -> Any:
    """Load a JSON or JSONL file from a local path."""
    p = Path(path)
    if p.suffix == ".jsonl":
        lines = p.read_text().splitlines()
        return [json.loads(line) for line in lines if line.strip()]
    return json.loads(p.read_text())


def _load_blue_model_registry() -> list[dict[str, Any]] | None:
    """Read the blue model registry pointed at by an env var, if any.

    Checks the dedicated blue registry env var first, then the generic one.
    Returns None when neither is set, in which case the backend falls back
    to the copy stored on the Modal volume.
    """
    for env_var in (BLUE_MODEL_REGISTRY_ENV, MODEL_REGISTRY_ENV):
        path = os.environ.get(env_var, "").strip()
        if path:
            break
    else:
        return None

    loaded = _load_json_file(path)
    # A wrapped registry file ({"models": [...]}) is unwrapped; a bare list
    # (or a dict without "models") passes through unchanged.
    if isinstance(loaded, dict):
        return loaded.get("models", loaded)
    return loaded


def score_blue_with_pairwise(
    model_names: Iterable[str],
    *,
    submission_id: str | None = None,
    submitter: str | None = None,
    hf_link: str | None = None,
) -> tuple[float, list[dict[str, Any]]]:
    """Score a blue team submission on the deployed Modal backend.

    A registry set via HACKATHON_BLUE_MODEL_REGISTRY (or the generic
    HACKATHON_MODEL_REGISTRY) is forwarded to the backend; when neither is
    set the backend uses its own copy from the Modal volume.  Blue heldout
    images are always loaded server-side (secret).  Passing a
    submission_id makes the backend persist the result to the volume for
    crash recovery.

    Returns (average CKA score, per-pair score records).
    """
    remote_fn = _get_modal_function("score_blue_submission")
    payload = remote_fn.remote(
        model_names=list(model_names),
        model_registry=_load_blue_model_registry(),
        batch_size=_get_batch_size(),
        submission_id=submission_id,
        submitter=submitter,
        hf_link=hf_link,
    )
    return float(payload.get("avg_cka", 0.0)), list(payload.get("pairwise", []))


def score_red_with_pairwise(
    selected_stimuli: Iterable[dict[str, str] | str],
    *,
    submission_id: str | None = None,
    submitter: str | None = None,
    hf_link: str | None = None,
) -> tuple[float, list[dict[str, Any]]]:
    """Score a red team submission on the deployed Modal backend.

    Stimuli may be dicts or "dataset::identifier" keys; string keys are
    normalized into dicts before the call.  The red team model registry is
    always loaded server-side from the Modal volume (secret — never sent
    from the public Space).  Passing a submission_id makes the backend
    persist the result to the volume for crash recovery.

    Returns (red team score, per-pair score records).

    Raises:
        ValueError: if a string stimulus key lacks the "::" separator.
    """
    normalized: list[dict[str, str]] = []
    for entry in selected_stimuli:
        if not isinstance(entry, str):
            normalized.append(dict(entry))
            continue
        dataset_name, sep, image_identifier = entry.partition("::")
        if not sep:
            raise ValueError(f"Invalid stimulus key format: {entry}")
        normalized.append(
            {"dataset_name": dataset_name, "image_identifier": image_identifier}
        )

    remote_fn = _get_modal_function("score_red_submission")
    payload = remote_fn.remote(
        selected_stimuli=normalized,
        batch_size=_get_batch_size(),
        submission_id=submission_id,
        submitter=submitter,
        hf_link=hf_link,
    )
    return float(payload.get("score", 0.0)), list(payload.get("pairwise", []))


def fetch_volume_submissions(team: str | None = None) -> list[dict[str, Any]]:
    """Return the submissions persisted on the Modal volume.

    Optionally filtered by team.  Used to re-sync local state after a
    Space restart.
    """
    list_fn = _get_modal_function("list_submissions")
    return list_fn.remote(team=team)


def push_submissions_to_volume(submissions: list[dict[str, Any]]) -> dict[str, int]:
    """Upload local submissions to the Modal volume.

    Used to backfill the volume after a Modal crash or volume wipe.

    Returns:
        A counts dict of the form {"added": N, "skipped": M}.
    """
    backfill_fn = _get_modal_function("backfill_submissions")
    return backfill_fn.remote(submissions=submissions)