File size: 15,828 Bytes
fb2e410
 
b714bbd
 
 
 
fb2e410
 
 
 
 
 
b714bbd
fb2e410
b714bbd
fb2e410
b714bbd
fb2e410
19b064e
fb2e410
 
 
 
 
 
19b064e
 
fb2e410
 
 
 
 
b714bbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb2e410
 
b714bbd
fb2e410
 
 
 
 
b714bbd
fb2e410
 
 
b714bbd
fb2e410
19b064e
 
 
fb2e410
 
b714bbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623d481
b714bbd
623d481
b714bbd
19b064e
b714bbd
 
 
 
 
 
19b064e
b714bbd
 
19b064e
b714bbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19b064e
 
b714bbd
19b064e
 
b714bbd
 
19b064e
 
 
b714bbd
 
 
 
 
 
 
 
 
19b064e
b714bbd
19b064e
 
b714bbd
 
19b064e
b714bbd
 
 
 
 
 
 
 
 
 
 
 
19b064e
fb2e410
 
 
 
 
b714bbd
fb2e410
 
 
 
b714bbd
 
 
e13c0c4
b714bbd
 
 
 
 
 
 
 
 
 
 
 
 
e13c0c4
 
b714bbd
e13c0c4
b714bbd
 
 
 
 
19b064e
 
 
 
 
b714bbd
19b064e
 
 
 
 
b714bbd
 
 
fb2e410
b714bbd
 
19b064e
b714bbd
 
 
19b064e
fb2e410
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b714bbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb2e410
b714bbd
fb2e410
b714bbd
 
fb2e410
b714bbd
 
 
fb2e410
 
 
 
 
 
 
 
 
 
b714bbd
 
 
 
 
 
 
fb2e410
b714bbd
 
fb2e410
b714bbd
fb2e410
b714bbd
fb2e410
b714bbd
fb2e410
b714bbd
 
 
 
 
fb2e410
b714bbd
fb2e410
 
b714bbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19b064e
b714bbd
 
 
19b064e
b714bbd
 
19b064e
b714bbd
19b064e
 
b714bbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19b064e
b714bbd
19b064e
 
b714bbd
 
fb2e410
 
 
 
 
 
19b064e
b714bbd
 
fb2e410
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
"""Background worker for benchmark card generation.

Detects new benchmark folders in EEE_datastore, generates cards via
run_eee_pipeline(), and uploads them to evaleval/auto-benchmarkcards.

Uses Jenny's Entity Registry for canonical ID resolution and dedup.
"""

import json
import logging
import os
import tempfile
import time
from datetime import datetime, timezone
from functools import wraps
from pathlib import Path
from typing import Any, Optional

import requests
from huggingface_hub import HfApi, snapshot_download

# Module-wide logger; the name "worker" is fixed rather than derived from __name__.
logger = logging.getLogger("worker")

# Hugging Face dataset repo whose data/<folder>/ entries are scanned for new benchmarks.
EEE_REPO = "evaleval/EEE_datastore"
# Hugging Face dataset repo that receives generated cards under cards/<name>.json.
CARDS_REPO = "evaleval/auto-benchmarkcards"
# Base URL of the Entity Registry API; "/resolve" and "/resolve/batch" are appended to it.
ENTITY_REGISTRY_URL = "https://evaleval-entity-registry.hf.space/api/v1"

# Persistent storage on HF Spaces (mounted volume).
# Defaults to /data; set the PERSISTENT_DIR environment variable to point at
# another location (e.g. a scratch directory for local development).
PERSISTENT_DIR = Path(os.environ.get("PERSISTENT_DIR", "/data"))
# JSON file holding known folders, job history and the pending queue.
STATE_FILE = PERSISTENT_DIR / "state.json"

# When the FORCE_REGENERATE env var is "1", "true" or "yes" (case-insensitive),
# the dedup filter is bypassed and every discovered benchmark is regenerated.
FORCE_REGENERATE = os.environ.get("FORCE_REGENERATE", "").lower() in ("1", "true", "yes")


# -- Retry decorator for transient failures --

def retry(max_attempts=3, delay=5, backoff=2):
    """Build a decorator that retries a callable with exponential backoff.

    Args:
        max_attempts: Total number of calls to make before giving up. Values
            below 1 are treated as 1 so the wrapped function always runs at
            least once.
        delay: Seconds to sleep after the first failed attempt.
        backoff: Multiplier applied to the sleep after each further failure
            (sleep = delay * backoff ** attempt_index).

    Returns:
        A decorator. The decorated function returns whatever the wrapped
        function returns and re-raises the last exception once every attempt
        has failed.
    """
    # A non-positive count would otherwise skip the loop entirely and return
    # None without ever invoking the wrapped function.
    attempts = max(1, max_attempts)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    # Last attempt: surface the original exception unchanged.
                    if attempt == attempts - 1:
                        raise
                    wait = delay * (backoff ** attempt)
                    logger.warning(
                        "%s failed (attempt %d/%d), retrying in %ds: %s",
                        func.__name__, attempt + 1, attempts, wait, e,
                    )
                    time.sleep(wait)
        return wrapper
    return decorator


# -- State management (atomic writes) --

def load_state() -> dict:
    """Load persistent worker state from STATE_FILE.

    Returns:
        A dict that always contains the "known_folders", "jobs" and
        "pending_folders" keys (each defaulting to an empty list). A missing,
        unreadable or non-object state file yields a fresh default state;
        the problem is logged rather than raised.
    """
    state: dict = {}
    if STATE_FILE.exists():
        try:
            loaded = json.loads(STATE_FILE.read_text())
        except Exception:
            logger.exception("Failed to read state file, starting fresh")
        else:
            if isinstance(loaded, dict):
                state = loaded
            else:
                logger.error("State file does not contain a JSON object, starting fresh")

    # Guarantee the documented keys exist so callers can index them without
    # guarding; older or hand-edited state files may lack some of them.
    for key in ("known_folders", "jobs", "pending_folders"):
        state.setdefault(key, [])
    return state


def save_state(state: dict) -> None:
    """Persist state to STATE_FILE via write-then-replace.

    The JSON is first written to a sibling ".tmp" file and then moved over
    STATE_FILE, so readers never observe a partially written state file.

    Args:
        state: JSON-serializable state dict.
    """
    PERSISTENT_DIR.mkdir(parents=True, exist_ok=True)
    tmp = STATE_FILE.with_suffix(".tmp")
    tmp.write_text(json.dumps(state, indent=2))
    # replace() overwrites an existing target on every platform, whereas
    # rename() raises FileExistsError on Windows once a state file exists.
    tmp.replace(STATE_FILE)


def save_pending(folders: list[str]) -> None:
    """Append folders to the persisted pending queue, skipping duplicates.

    Order of first appearance is preserved; the updated state is written
    back immediately.
    """
    current = load_state()
    queue = current.get("pending_folders", [])
    for folder in folders:
        if folder in queue:
            continue
        queue.append(folder)
    current["pending_folders"] = queue
    save_state(current)


def pop_pending() -> list[str]:
    """Drain the persisted pending queue and return what it contained.

    The queue is reset to an empty list in the state file before returning.
    """
    snapshot = load_state()
    drained = snapshot.pop("pending_folders", [])
    # Re-create the key so the persisted state keeps its full shape.
    snapshot["pending_folders"] = []
    save_state(snapshot)
    return drained


# -- Entity Registry --

# In-memory memo of benchmark name -> canonical_id. A value of None records
# that the registry returned no id or that the lookup failed. Entries are
# not evicted by the code visible in this module.
_canonical_cache: dict[str, Optional[str]] = {}


def resolve_canonical_id(benchmark_name: str) -> Optional[str]:
    """Look up the canonical_id for a benchmark name in the Entity Registry.

    Results (including failures, stored as None) are memoized in
    _canonical_cache so each name triggers at most one HTTP request.

    Returns:
        The canonical_id string (e.g. "math"), or None when the registry has
        no id for the name or the request fails.
    """
    try:
        return _canonical_cache[benchmark_name]
    except KeyError:
        pass  # not memoized yet; ask the registry below

    request_body = {"raw_value": benchmark_name, "entity_type": "benchmark"}
    try:
        response = requests.post(
            f"{ENTITY_REGISTRY_URL}/resolve",
            json=request_body,
            timeout=10,
        )
        response.raise_for_status()
        resolved = response.json().get("canonical_id")
        _canonical_cache[benchmark_name] = resolved
        if resolved:
            logger.info("Entity Registry: '%s' -> '%s'", benchmark_name, resolved)
        return resolved
    except Exception:
        # Best effort: any failure is remembered as "no canonical id".
        logger.debug("Entity Registry lookup failed for '%s'", benchmark_name)
        _canonical_cache[benchmark_name] = None
        return None


def resolve_canonical_ids_batch(names: list[str]) -> dict[str, Optional[str]]:
    """Resolve many benchmark names to canonical_ids with one registry call.

    Names already present in _canonical_cache are served from it; only the
    remainder is sent to the batch endpoint. If the request fails, every
    name that was sent is memoized as None.

    Returns:
        Mapping of each requested name to its canonical_id or None.
    """
    missing = [name for name in names if name not in _canonical_cache]

    if missing:
        try:
            response = requests.post(
                f"{ENTITY_REGISTRY_URL}/resolve/batch",
                json=[{"raw_value": name, "entity_type": "benchmark"} for name in missing],
                timeout=30,
            )
            response.raise_for_status()
            # Results are paired with the submitted names by position.
            for name, entry in zip(missing, response.json()):
                resolved = entry.get("canonical_id")
                _canonical_cache[name] = resolved
                if resolved:
                    logger.info("Entity Registry: '%s' -> '%s'", name, resolved)
        except Exception:
            logger.warning("Entity Registry batch resolve failed, using fallback")
            for name in missing:
                _canonical_cache[name] = None

    return {name: _canonical_cache.get(name) for name in names}


def _get_card_filename(benchmark_name: str) -> str:
    """Return the file stem under which a benchmark's card is stored.

    Prefers the Entity Registry canonical_id; when none is available, the
    name is passed through sanitize_benchmark_name and lower-cased.
    """
    canonical_id = resolve_canonical_id(benchmark_name)
    if not canonical_id:
        # Imported lazily, only when the fallback is actually needed.
        from auto_benchmarkcard.output import sanitize_benchmark_name
        return sanitize_benchmark_name(benchmark_name).lower()
    return canonical_id


# -- EEE folder detection --

def _extract_folders(file_list: list[str]) -> set[str]:
    """Extract unique top-level folder names under data/."""
    folders = set()
    for path in file_list:
        parts = path.split("/")
        if len(parts) >= 2 and parts[0] == "data":
            folders.add(parts[1])
    return folders


@retry(max_attempts=3, delay=5)
def detect_new_benchmarks() -> list[str]:
    """Return EEE_datastore folders that are not yet recorded as known.

    Lists the files of the EEE dataset repo, derives the folder names under
    data/, and subtracts the "known_folders" stored in the persisted state.
    The result is sorted alphabetically.
    """
    repo_files = HfApi().list_repo_files(EEE_REPO, repo_type="dataset")
    present = _extract_folders(repo_files)
    already_known = set(load_state().get("known_folders", []))

    unseen = sorted(present - already_known)
    if not unseen:
        logger.info(
            "No new folders (known: %d, current: %d)", len(already_known), len(present)
        )
    else:
        logger.info("Detected %d new folders: %s", len(unseen), unseen)

    return unseen


# -- Download & upload --

@retry(max_attempts=3, delay=10)
def _download_folders(folder_names: list[str], target_dir: str) -> Path:
    """Fetch the JSON files of the given EEE folders into target_dir.

    Only files matching data/<folder>/**/*.json are requested from the
    dataset repo.

    Returns:
        Path to the "data" directory inside target_dir.
    """
    json_globs = [f"data/{folder}/**/*.json" for folder in folder_names]
    logger.info("Downloading %d EEE folders to %s", len(folder_names), target_dir)

    snapshot_download(
        repo_id=EEE_REPO,
        repo_type="dataset",
        local_dir=target_dir,
        allow_patterns=json_globs,
    )

    return Path(target_dir, "data")


@retry(max_attempts=3, delay=5)
def _upload_card(card: dict, benchmark_name: str, canonical_id: Optional[str] = None) -> bool:
    """Upload a generated card to the cards dataset repo as cards/<name>.json.

    Args:
        card: JSON-serializable card content.
        benchmark_name: Benchmark name; used in the commit message and, when
            canonical_id is not given, to derive the file name.
        canonical_id: File stem to use. When falsy, the stem comes from
            _get_card_filename(benchmark_name).

    Returns:
        True once the upload has succeeded.

    Raises:
        Whatever serialization or the upload raises, after the retry
        decorator has exhausted its attempts.
    """
    api = HfApi()
    filename = canonical_id or _get_card_filename(benchmark_name)
    remote_path = f"cards/{filename}.json"

    # Serialize in memory and hand the bytes straight to upload_file. This
    # avoids the on-disk temp file, which used to be left behind whenever
    # serialization failed part-way through.
    payload = json.dumps(card, indent=2).encode("utf-8")

    api.upload_file(
        path_or_fileobj=payload,
        path_in_repo=remote_path,
        repo_id=CARDS_REPO,
        repo_type="dataset",
        commit_message=f"Auto-generated card: {benchmark_name}",
    )
    logger.info("Uploaded card to %s/%s", CARDS_REPO, remote_path)
    return True


@retry(max_attempts=2, delay=5)
def _list_existing_cards() -> set[str]:
    """Return the stems of all cards/<stem>.json files in the cards repo."""
    prefix, suffix = "cards/", ".json"
    listing = HfApi().list_repo_files(CARDS_REPO, repo_type="dataset")
    return {
        entry[len(prefix):-len(suffix)]
        for entry in listing
        if entry.startswith(prefix) and entry.endswith(suffix)
    }


# -- Main processing --

def _build_dedup_filter(
    benchmark_names: list[str],
    existing_cards: set[str],
) -> list[str]:
    """Filter out benchmarks for which a card is already present.

    A benchmark is dropped when any of these is found in existing_cards,
    checked in this order:
      1. its Entity Registry canonical_id,
      2. its sanitized, lower-cased name,
      3. for names shaped like 'Parent - Child', the sanitized, lower-cased
         parent part.

    With FORCE_REGENERATE set, the input list is returned unfiltered.
    """
    if FORCE_REGENERATE:
        logger.info("FORCE_REGENERATE=true, skipping dedup")
        return benchmark_names

    # One batch call resolves every name up front.
    resolved = resolve_canonical_ids_batch(benchmark_names)

    from auto_benchmarkcard.output import sanitize_benchmark_name

    kept = []
    for name in benchmark_names:
        canonical_id = resolved.get(name)
        sanitized = sanitize_benchmark_name(name).lower()

        # Checks 1 and 2: canonical id first, then the sanitized name.
        if canonical_id and canonical_id in existing_cards:
            matched = canonical_id
        elif sanitized in existing_cards:
            matched = sanitized
        else:
            matched = None
        if matched is not None:
            logger.info("Skipping '%s' (card exists as '%s')", name, matched)
            continue

        # Check 3: 'MGSM - Bengali' is covered by an existing 'mgsm' card.
        if " - " in name:
            parent_key = sanitize_benchmark_name(name.partition(" - ")[0].strip()).lower()
            if parent_key in existing_cards:
                logger.info("Skipping '%s' (parent card exists as '%s')", name, parent_key)
                continue

        kept.append(name)

    total = len(benchmark_names)
    logger.info("Dedup: %d total, %d new, %d existing",
                total, len(kept),
                total - len(kept))
    return kept


def _persist_job_record(
    job_record: dict[str, Any],
    known_folders: Optional[list[str]] = None,
) -> None:
    """Stamp a job as completed and append it to the persisted job history.

    State is re-read here instead of being carried over from the start of the
    job, so anything written to the state file while the job was running
    (e.g. folders queued through save_pending) is not overwritten by a stale
    snapshot. Only the 50 most recent jobs are kept.

    Args:
        job_record: The job dict to store; "completed_at" is set here.
        known_folders: Folders to add to "known_folders", if any.
    """
    state = load_state()

    known = state.setdefault("known_folders", [])
    for folder_name in known_folders or []:
        if folder_name not in known:
            known.append(folder_name)

    job_record["completed_at"] = datetime.now(timezone.utc).isoformat()

    jobs = state.setdefault("jobs", [])
    jobs.append(job_record)
    state["jobs"] = jobs[-50:]
    save_state(state)


def process_new_benchmarks(new_folders: list[str]) -> None:
    """Generate and upload cards for benchmarks in new folders.

    Steps:
      1. List existing cards (dedup is disabled for the job if this fails).
      2. Download the folders into a temporary directory.
      3. Scan the download to discover benchmark and composite names.
      4. Drop names that already have a card.
      5. Run run_eee_pipeline() on the rest, uploading each card from the
         on_card_generated callback.
      6. Mark the folders as known and store a job record.

    A download or scan failure is recorded as a job with status
    "download_failed" / "scan_failed" and the folders are NOT marked as
    known. Exceptions raised by run_eee_pipeline() propagate to the caller;
    in that case no job record is written.

    Args:
        new_folders: Folder names under data/ in the EEE datastore.
    """
    from auto_benchmarkcard.eee_workflow import run_eee_pipeline
    from auto_benchmarkcard.tools.eee.eee_tool import scan_eee_folder
    from auto_benchmarkcard.workflow import setup_logging_suppression

    setup_logging_suppression(debug_mode=False)

    job_record: dict[str, Any] = {
        "started_at": datetime.now(timezone.utc).isoformat(),
        "folders": new_folders,
        "results": [],
    }
    results = job_record["results"]

    # Pre-fetch existing cards for dedup
    try:
        existing_cards = _list_existing_cards()
        logger.info("Found %d existing cards in %s", len(existing_cards), CARDS_REPO)
    except Exception:
        logger.warning("Failed to list existing cards, dedup disabled for this job")
        existing_cards = set()

    # Download all folders into one shared temp dir
    with tempfile.TemporaryDirectory(prefix="eee_batch_") as tmpdir:
        try:
            data_path = _download_folders(new_folders, tmpdir)
        except Exception:
            logger.exception("Failed to download EEE folders")
            results.append({"folders": new_folders, "status": "download_failed"})
            _persist_job_record(job_record)
            return

        # Scan to discover benchmark names for dedup
        try:
            scan_result = scan_eee_folder(str(data_path))
        except Exception:
            logger.exception("Failed to scan EEE data")
            results.append({"folders": new_folders, "status": "scan_failed"})
            _persist_job_record(job_record)
            return

        all_names = (
            list(scan_result.benchmarks.keys())
            + list(scan_result.composites.keys())
        )
        benchmarks_to_generate = _build_dedup_filter(all_names, existing_cards)

        if not benchmarks_to_generate:
            logger.info("All benchmarks already have cards, nothing to generate")
            results.append({"status": "all_existing"})
        else:
            # Upload callback: called by run_eee_pipeline for each generated card
            def _on_card_generated(name: str, card: dict) -> None:
                canonical = resolve_canonical_id(name)

                # Enrich card metadata. The card may wrap its content in a
                # "benchmark_card" key; otherwise the card itself is enriched.
                inner = card.get("benchmark_card", card)
                info = inner.get("card_info", {})
                info["source"] = "webhook"
                if canonical:
                    info["canonical_id"] = canonical
                inner["card_info"] = info

                # An upload failure is recorded but must not abort the
                # pipeline for the remaining benchmarks.
                try:
                    _upload_card(card, name, canonical_id=canonical)
                    results.append({
                        "benchmark": name,
                        "canonical_id": canonical,
                        "status": "uploaded",
                    })
                except Exception:
                    logger.exception("Failed to upload card for %s", name)
                    results.append({
                        "benchmark": name, "status": "upload_failed",
                    })

            # Run the unified pipeline
            summary = run_eee_pipeline(
                eee_path=str(data_path),
                output_path=str(PERSISTENT_DIR / "output"),
                benchmarks_filter=benchmarks_to_generate,
                on_card_generated=_on_card_generated,
            )

            # Record skipped/failed from pipeline summary
            for item in summary.get("skipped", []):
                results.append({
                    "benchmark": item.get("benchmark", "unknown"),
                    "status": f"skipped:{item.get('reason', 'unknown')}",
                })

            # Only add failures not already recorded (by the callback or the
            # skipped loop); the set is built once and kept up to date.
            recorded = {r.get("benchmark") for r in results}
            for name in summary.get("failed", []):
                if name not in recorded:
                    results.append({
                        "benchmark": name, "status": "generation_failed",
                    })
                    recorded.add(name)

    uploaded = sum(1 for r in results if r.get("status") == "uploaded")
    failed = sum(1 for r in results if "failed" in r.get("status", ""))
    skipped = sum(1 for r in results if r.get("status", "").startswith("skipped"))
    logger.info("Job complete: %d uploaded, %d failed, %d skipped", uploaded, failed, skipped)

    # Mark folders as known and store the job record in one state write.
    _persist_job_record(job_record, known_folders=new_folders)