File size: 2,752 Bytes
fc329a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from __future__ import annotations

import json
from pathlib import Path

# Directory two levels up from this file (the parent of the folder that
# contains this script); the code-file checks are resolved against it.
ROOT = Path(__file__).resolve().parents[1]
# The dataset bundle is looked up as a sibling directory of ROOT.
DATA = ROOT.parent.joinpath('SimplexTasks-12-data')

# Paths (relative to DATA) that must exist inside the dataset bundle.
REQUIRED_DATA_FILES = [
    # top-level metadata
    'README.md', 'CITATION.cff', 'LICENSE', 'MANIFEST.json',
    # protocol and provenance documents
    'BENCHMARK_PROTOCOL.md', 'EVALUATION_PROTOCOL.md',
    'ASSET_PROVENANCE_AND_TERMS.md', 'REPRODUCIBILITY.md',
    # viewer table
    'data/viewer/tasks.jsonl',
]

# Entry names that must exist under DATA/data/derived.
REQUIRED_REAL_DIRS = [
    'cifar10_softmax', 'topics_20newsgroups', 'samson_unmixing',
    'pbmc_pseudobulk', 'utkface_ldl', 'affectivetext',
]

# Entry names that must exist under DATA/data/synthetic: 'D1' through 'D6'.
REQUIRED_SYNTH_DIRS = [f'D{index}' for index in range(1, 7)]

# Paths (relative to ROOT) that must exist in the code tree.
REQUIRED_CODE_FILES = [
    # top-level metadata and environment specs
    'README.md', 'CITATION.cff', 'LICENSE',
    'environment.yml', 'requirements.txt',
    # reproduction entry points
    'scripts/reproduce_tables.py', 'scripts/reproduce_figures.py',
    'scripts/run_benchmark.py',
    # reviewer documentation
    'docs/reviewer_quickstart.md',
]


def require(path: Path, label: str, issues: list[str]) -> None:
    """Append a 'missing ...' message to *issues* if *path* does not exist.

    Args:
        path: Filesystem location to probe (file or directory).
        label: Human-readable description placed in the message.
        issues: List of problem messages; mutated in place.
    """
    if path.exists():
        return
    issues.append('missing {}: {}'.format(label, path))


# Number of tasks expected both as rows in the viewer table and as the
# manifest's 'task_count' value.
_EXPECTED_TASK_COUNT = 12


def _check_viewer_tasks(viewer_path: Path, issues: list[str]) -> None:
    """Validate the viewer JSONL file: row count and a single shared schema.

    Unreadable files, malformed JSON lines and non-object rows are recorded
    in *issues* rather than raised, so one broken file cannot hide the other
    problems that were already collected.
    """
    try:
        text = viewer_path.read_text(encoding='utf-8')
    except (OSError, ValueError) as exc:
        # ValueError covers UnicodeDecodeError; OSError covers e.g. the path
        # being a directory or lacking read permission.
        issues.append(f'unreadable data/viewer/tasks.jsonl: {exc}')
        return

    row_count = 0
    schemas = set()
    for lineno, line in enumerate(text.splitlines(), start=1):
        if not line.strip():
            continue  # blank lines are not rows
        row_count += 1
        try:
            row = json.loads(line)
        except ValueError as exc:  # json.JSONDecodeError subclasses ValueError
            issues.append(
                f'invalid JSON on line {lineno} of data/viewer/tasks.jsonl: {exc}'
            )
            continue
        if not isinstance(row, dict):
            issues.append(
                f'non-object row on line {lineno} of data/viewer/tasks.jsonl'
            )
            continue
        # Key order is part of the schema, so rows must list keys identically.
        schemas.add(tuple(row.keys()))

    if row_count != _EXPECTED_TASK_COUNT:
        issues.append(f'unexpected row count in data/viewer/tasks.jsonl: {row_count}')
    if len(schemas) != 1:
        issues.append('inconsistent JSONL schema in data/viewer/tasks.jsonl')


def _check_manifest(manifest_path: Path, issues: list[str]) -> None:
    """Validate that MANIFEST.json is a JSON object with the expected task_count."""
    try:
        manifest = json.loads(manifest_path.read_text(encoding='utf-8'))
    except (OSError, ValueError) as exc:
        issues.append(f'unreadable MANIFEST.json: {exc}')
        return
    if not isinstance(manifest, dict):
        issues.append('MANIFEST.json is not a JSON object')
        return
    task_count = manifest.get('task_count')
    if task_count != _EXPECTED_TASK_COUNT:
        issues.append(f"unexpected task_count in MANIFEST.json: {task_count}")


def main() -> None:
    """Check that the code tree and the dataset bundle are complete.

    Every problem found is collected first, so a single run reports all of
    them together.

    Raises:
        SystemExit: with the newline-joined list of problems when any check
            fails. On success 'artifact integrity ok' is printed instead.
    """
    issues: list[str] = []
    require(DATA, 'dataset bundle', issues)
    for rel in REQUIRED_CODE_FILES:
        require(ROOT / rel, f'code file {rel}', issues)

    # Bundle contents are only inspected when the bundle itself is present;
    # otherwise the single 'missing dataset bundle' message is enough.
    if DATA.exists():
        for rel in REQUIRED_DATA_FILES:
            require(DATA / rel, f'data file {rel}', issues)
        for name in REQUIRED_REAL_DIRS:
            require(DATA / 'data' / 'derived' / name, f'derived task {name}', issues)
        for name in REQUIRED_SYNTH_DIRS:
            require(DATA / 'data' / 'synthetic' / name, f'synthetic task {name}', issues)
        require(DATA / 'data' / 'summaries' / 'figure_inputs', 'figure input cache', issues)
        require(DATA / 'data' / 'summaries' / 'table_inputs', 'table input cache', issues)

        # A missing viewer file or manifest was already reported by the
        # REQUIRED_DATA_FILES loop, so content checks run only when present.
        viewer_path = DATA / 'data' / 'viewer' / 'tasks.jsonl'
        if viewer_path.exists():
            _check_viewer_tasks(viewer_path, issues)
        manifest_path = DATA / 'MANIFEST.json'
        if manifest_path.exists():
            _check_manifest(manifest_path, issues)

    if issues:
        raise SystemExit('\n'.join(issues))
    print('artifact integrity ok')


if __name__ == '__main__':
    main()