# simplexuq-code/scripts/check_artifact_integrity.py
# Uploaded by anonymous0523ly.
# Commit fc329a3 (verified): "Initial anonymous code release".
# File size: 2.75 kB.
from __future__ import annotations
import json
from pathlib import Path
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]

# The dataset bundle is looked up as a sibling directory of ROOT.
DATA = ROOT.parent.joinpath('SimplexTasks-12-data')

# Paths (relative to DATA) that must exist inside the dataset bundle.
REQUIRED_DATA_FILES = [
    'README.md',
    'CITATION.cff',
    'LICENSE',
    'MANIFEST.json',
    'BENCHMARK_PROTOCOL.md',
    'EVALUATION_PROTOCOL.md',
    'ASSET_PROVENANCE_AND_TERMS.md',
    'REPRODUCIBILITY.md',
    'data/viewer/tasks.jsonl',
]

# Entries that must exist under DATA/data/derived.
REQUIRED_REAL_DIRS = [
    'cifar10_softmax',
    'topics_20newsgroups',
    'samson_unmixing',
    'pbmc_pseudobulk',
    'utkface_ldl',
    'affectivetext',
]

# Entries that must exist under DATA/data/synthetic: 'D1' through 'D6'.
REQUIRED_SYNTH_DIRS = [f'D{index}' for index in range(1, 7)]

# Paths (relative to ROOT) that must exist inside the code release.
REQUIRED_CODE_FILES = [
    'README.md',
    'CITATION.cff',
    'LICENSE',
    'environment.yml',
    'requirements.txt',
    'scripts/reproduce_tables.py',
    'scripts/reproduce_figures.py',
    'scripts/run_benchmark.py',
    'docs/reviewer_quickstart.md',
]
def require(path: Path, label: str, issues: list[str]) -> None:
    """Append a 'missing ...' entry to *issues* when *path* does not exist.

    Args:
        path: Filesystem location that is expected to be present.
        label: Human-readable description used in the recorded message.
        issues: List that collects problems; it is mutated in place.
    """
    if path.exists():
        return
    message = f'missing {label}: {path}'
    issues.append(message)
# Number of tasks the viewer table and the manifest are both expected to report.
EXPECTED_TASK_COUNT = 12


def _check_viewer_rows(viewer_path: Path, issues: list[str]) -> None:
    """Check row count and schema consistency of the viewer JSONL file.

    Unreadable files, lines that are not valid JSON, and rows that are not
    JSON objects are recorded in *issues* instead of aborting the run.
    """
    try:
        text = viewer_path.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError) as exc:
        issues.append(f'unreadable data/viewer/tasks.jsonl: {exc}')
        return

    rows = []
    malformed = False
    # Split on '\n' only: str.splitlines() also breaks on characters such as
    # U+2028/U+2029/U+0085, which may legally appear unescaped inside a JSON
    # string and would cut a valid record in half.
    for lineno, line in enumerate(text.split('\n'), start=1):
        if not line.strip():
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError as exc:
            issues.append(f'invalid JSON in data/viewer/tasks.jsonl line {lineno}: {exc}')
            malformed = True
            continue
        if not isinstance(row, dict):
            issues.append(f'non-object row in data/viewer/tasks.jsonl line {lineno}')
            malformed = True
            continue
        rows.append(row)

    if malformed:
        # Count and schema checks on a partially parsed file would only add
        # misleading follow-up messages.
        return

    # A schema is the tuple of keys, so key order counts as part of it.
    schemas = {tuple(row.keys()) for row in rows}
    if len(rows) != EXPECTED_TASK_COUNT:
        issues.append(f'unexpected row count in data/viewer/tasks.jsonl: {len(rows)}')
    if len(schemas) != 1:
        issues.append('inconsistent JSONL schema in data/viewer/tasks.jsonl')


def _check_manifest(manifest_path: Path, issues: list[str]) -> None:
    """Check that MANIFEST.json is a JSON object with the expected task_count."""
    try:
        manifest = json.loads(manifest_path.read_text(encoding='utf-8'))
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        issues.append(f'unreadable or invalid MANIFEST.json: {exc}')
        return
    if not isinstance(manifest, dict):
        issues.append('MANIFEST.json is not a JSON object')
        return
    task_count = manifest.get('task_count')
    if task_count != EXPECTED_TASK_COUNT:
        issues.append(f"unexpected task_count in MANIFEST.json: {task_count}")


def main() -> None:
    """Verify that the code release and the dataset bundle are complete.

    Checks that every required code file exists under ROOT and, when the
    dataset bundle directory exists, that its required files, derived and
    synthetic task entries, summary caches, viewer table and manifest are
    present and well-formed. All problems are collected before reporting.

    Raises:
        SystemExit: With the newline-joined list of problems when at least
            one check fails.
    """
    issues: list[str] = []
    require(DATA, 'dataset bundle', issues)
    for rel in REQUIRED_CODE_FILES:
        require(ROOT / rel, f'code file {rel}', issues)

    if DATA.exists():
        for rel in REQUIRED_DATA_FILES:
            require(DATA / rel, f'data file {rel}', issues)
        for name in REQUIRED_REAL_DIRS:
            require(DATA / 'data' / 'derived' / name, f'derived task {name}', issues)
        for name in REQUIRED_SYNTH_DIRS:
            require(DATA / 'data' / 'synthetic' / name, f'synthetic task {name}', issues)
        require(DATA / 'data' / 'summaries' / 'figure_inputs', 'figure input cache', issues)
        require(DATA / 'data' / 'summaries' / 'table_inputs', 'table input cache', issues)

        # Absence of either file is already reported by the required-file
        # loop above, so content checks only run when the file is present.
        viewer_path = DATA / 'data' / 'viewer' / 'tasks.jsonl'
        if viewer_path.exists():
            _check_viewer_rows(viewer_path, issues)
        manifest_path = DATA / 'MANIFEST.json'
        if manifest_path.exists():
            _check_manifest(manifest_path, issues)

    if issues:
        raise SystemExit('\n'.join(issues))
    print('artifact integrity ok')
# Run the integrity check only when this file is executed as a script,
# so that importing the module has no side effects.
if __name__ == '__main__':
    main()