"""Build task cards and benchmark docs for the SimplexTasks-12 release.""" from __future__ import annotations import json from pathlib import Path from textwrap import dedent import yaml REPO_ROOT = Path(__file__).resolve().parents[1] RELEASE_ROOT = REPO_ROOT / "release" / "simplextasks-12" DOCS_DIR = RELEASE_ROOT / "docs" REAL_EXTRAS = { "cifar10_softmax": { "evaluation_role": "Extreme classification-style stress test for allocation failure under fixed class-probability vectors.", "target_definition": "A 10-way one-hot class distribution paired with a frozen ResNet-18 softmax cache.", "default_score": "Total variation / L1 on the simplex.", "default_stratification": "Entropy bins of the softmax prediction.", "limitations": "This is a classification proxy rather than a naturally continuous composition task. The release ships derived arrays only and does not mirror raw CIFAR-10 images.", }, "topics_20ng": { "evaluation_role": "Smooth-heterogeneity real task for topic-mixture prediction.", "target_definition": "Fixed 10-topic compositions constructed once from the 20 Newsgroups corpus, with a TF-IDF-to-topic-mixture kNN regressor as the predictor.", "default_score": "Aitchison distance.", "default_stratification": "Entropy bins of the predicted topic mixture.", "limitations": "Both the target and the predictor output are model-derived topic mixtures. The release exposes derived simplex arrays, not the raw text corpus.", }, "samson_unmixing": { "evaluation_role": "Natural low-dimensional compositional benchmark aligned with grouped repair.", "target_definition": "Three-endmember abundance compositions paired with a frozen NMF abundance estimator.", "default_score": "Aitchison distance.", "default_stratification": "Boundary bins on the abundance prediction.", "limitations": "The benchmark files are public and source-cited, but the upstream bundle did not include an explicit license file; downstream reuse should preserve attribution and the original benchmark context.", }, "pbmc3k_pseudobulk": { "evaluation_role": "Semi-synthetic control with known composition targets.", "target_definition": "Pseudobulk cell-type fractions generated from PBMC3K together with a fixed NNLS deconvolution predictor.", "default_score": "Aitchison distance.", "default_stratification": "Boundary bins on the predicted cell-type fractions.", "limitations": "This is a semi-synthetic control rather than a deployment-ready deconvolution benchmark. The grouped-repair story is sensitive to the pseudobulk concentration setting, which is documented in the paper appendix.", }, "utkface_age_ldl": { "evaluation_role": "Image-based label-distribution task with strong grouped heterogeneity.", "target_definition": "A 10-bin age label distribution constructed from UTKFace ages and predicted from thumbnail image features using PCA+kNN regression.", "default_score": "Aitchison distance.", "default_stratification": "Entropy bins of the predicted age distribution.", "limitations": "The underlying face images are not redistributed and remain subject to the source dataset's research-use constraints. The benchmark ships derived features and simplex arrays only.", }, "affectivetext_emotions": { "evaluation_role": "Small-sample weak-structure counterexample with a frozen language-model predictor.", "target_definition": "Normalized six-emotion gold scores from SemEval-2007 AffectiveText paired with a frozen zero-shot scorer cache.", "default_score": "Aitchison distance.", "default_stratification": "Boundary bins on the predicted emotion mixture.", "limitations": "The reported benchmark tables use a frozen closed-model cache; the release omits raw headlines and raw API responses. A fully open TF-IDF+SVD+kNN fallback cache-builder is included for local reproduction, but it is not identical to the main predictor used in the paper tables.", }, } SYNTH_EXTRAS = { "d1_homogeneous": { "evaluation_role": "Negative control with no residual-scale heterogeneity.", "limitations": "Intended to confirm that the benchmark does not manufacture allocation failures when the DGP is homogeneous.", }, "d2_pure_scale": { "evaluation_role": "Canonical smooth-scale heterogeneity regime used to motivate normalization-based repair.", "limitations": "This regime isolates scale variation only; it is not intended to cover bias-type misspecification or changing tail shape.", }, "d3_discrete_groups_aligned": { "evaluation_role": "Aligned discrete-group regime where group-wise calibration should have an inductive advantage.", "limitations": "The grouping is built into the DGP. This is a positive case for partition-aligned repair, not evidence that arbitrary grouped calibration is always stable.", }, "d4_model_bias": { "evaluation_role": "Bias-type counterexample where changing the wrapper alone is not enough.", "limitations": "This regime is not reducible to a single local scale field, so normalization cannot be expected to fully repair it.", }, "d5_heavy_tail": { "evaluation_role": "Smooth-scale regime with heavy tails to stress robustness beyond Gaussian residuals.", "limitations": "The regime is still synthetic and targeted; it does not span every possible deviation from the baseline score distribution.", }, "d6_high_k": { "evaluation_role": "High-dimensional simplex regime exposing the compute and allocation challenges that appear when K is large.", "limitations": "Benchmark-scale exact methods are too expensive here; the paper marks the exact-reference entries separately as D6 dagger cells.", }, } def load_json(path: Path) -> dict: with open(path) as f: return json.load(f) def load_yaml(path: Path) -> dict: with open(path) as f: return yaml.safe_load(f) def write(path: Path, text: str) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(text.rstrip() + "\n") def bullet_list(items: list[str]) -> str: return "\n".join(f"- {item}" for item in items) def synthetic_task_card(task_dir: Path, metadata: dict, config: dict, extra: dict) -> str: evaluation = config["evaluation"] dgp = config["dgp"] data = config["data"] lines = [ f"# {metadata['task_name']} Task Card", "", f"- Task ID: `{metadata['task_id']}`", "- Subset: synthetic", f"- Samples: `{metadata['n_samples']}`", f"- Simplex dimension: `{metadata['simplex_dim']}`", f"- Predictor: {metadata['predictor']}", f"- Regime label: {metadata['regime_label']}", "", "## Evaluation Role", "", extra["evaluation_role"], "", "## DGP Summary", "", f"- DGP family: `{dgp['name']}`", "- Default score: Aitchison distance", f"- Default stratification: `{evaluation['strata_method']}` with `{evaluation['n_strata']}` strata", f"- Calibration size: `{data['n_cal']}`", f"- Test size: `{data['n_test']}`", f"- Repetitions: `{data['n_rep']}`", "", "## Release Contents", "", bullet_list([f"`{name}`" for name in metadata["available_arrays"]]), "", "## Provenance And Rebuild", "", f"- Source asset: {metadata['source_asset']}", f"- Config file: `{metadata['config_file']}`", f"- Redistribution: `{metadata['redistribution']}`", f"- Seed: `{metadata['seed']}`", "", "## Limitations", "", extra["limitations"], ] return "\n".join(lines) def real_task_card(task_dir: Path, metadata: dict, extra: dict) -> str: lines = [ f"# {metadata['task_name']} Task Card", "", f"- Task ID: `{metadata['task_id']}`", "- Subset: real", f"- Samples: `{metadata['n_samples']}`", f"- Simplex dimension: `{metadata['simplex_dim']}`", f"- Predictor: {metadata['predictor']}", "", "## Evaluation Role", "", extra["evaluation_role"], "", "## Target And Predictor", "", extra["target_definition"], "", "## Default Benchmark Settings", "", f"- Default score: {extra['default_score']}", f"- Default stratification: {extra['default_stratification']}", f"- Redistribution: `{metadata['redistribution']}`", "", "## Release Contents", "", bullet_list([f"`{name}`" for name in metadata["available_arrays"]]), "", "## Provenance And Usage Notes", "", f"- Source asset: {metadata['source_asset']}", f"- Metadata note: {metadata['notes']}", "", "## Limitations", "", extra["limitations"], ] return "\n".join(lines) def build_task_cards() -> None: for task_dir in sorted((RELEASE_ROOT / "real").glob("*")): metadata = load_json(task_dir / "metadata.json") extra = REAL_EXTRAS[metadata["task_id"]] write(task_dir / "task_card.md", real_task_card(task_dir, metadata, extra)) for task_dir in sorted((RELEASE_ROOT / "synthetic").glob("*")): metadata = load_json(task_dir / "metadata.json") config = load_yaml(task_dir / "config.yaml") extra = SYNTH_EXTRAS[metadata["task_id"]] write(task_dir / "task_card.md", synthetic_task_card(task_dir, metadata, config, extra)) def build_docs() -> None: benchmark_card = dedent( """ # SimplexTasks-12 Benchmark Card SimplexTasks-12 is the processed task collection behind the SimplexUQ benchmark. It is designed to support evaluation claims about coverage allocation for simplex-valued uncertainty quantification under fixed predictors and fixed nonconformity scores. ## Supported Claims - Whether a fixed simplex-valued predictor exhibits allocation failure beyond its marginal coverage. - Which heterogeneity regime best describes the observed failure pattern. - Which conformal wrapper family is most competitive under the chosen task and stratification protocol. ## Claims The Benchmark Does Not Support - Universal wrapper rankings across all simplex tasks. - Deployment-readiness claims for any predictor. - Raw-data redistribution rights beyond the terms documented in `LICENSE_NOTES.md`. ## Benchmark Contents - 6 synthetic regimes with canonical `task.npz` bundles and copied YAML configs. - 6 fixed-predictor real tasks with cached `(Y, U)` arrays or derived features. - Per-task `task_card.md` files and `metadata.json` provenance records. - Release-level rebuild instructions for the paper tables and figures. ## Reproducibility Contract - Benchmark evaluation always operates on frozen predictor outputs. - Default stratification rules are fixed before wrapper comparison. - Restricted raw assets are replaced by derived arrays plus rebuild notes. - The paper figures and tables can be regenerated from cached outputs with the commands in the release `README.md`. ## Responsible Use Some tasks rely on non-redistributable source assets or derived features from sensitive domains such as face images. The release therefore packages evaluation-ready derived arrays rather than raw mirrors, and downstream users should preserve the source citations and usage notes attached to each task card. """ ).strip() evaluation_protocol = dedent( """ # SimplexTasks-12 Evaluation Protocol ## Fixed-Predictor Principle Every benchmark slice fixes the predictor first and then varies only the conformal wrapper. This keeps the evaluation target on uncertainty allocation rather than on predictor training. ## Default Scores - Real and synthetic composition tasks use Aitchison distance unless the target lies on simplex vertices. - CIFAR-10 uses total variation / L1 because Aitchison distance is ill-defined at the one-hot boundary. ## Stratification Rules - Default strata are entropy bins, boundary bins, or task-specific dominant-group partitions. - Sensitivity sweeps use fixed alternative stratifications defined from cached prediction vectors only. - Stratification maps are not tuned per wrapper and do not depend on calibration/test responses. ## Main Metrics - Marginal coverage. - Max disparity across prediction-space strata. - Worst-stratum coverage. - Coverage variance. - SSCV and mean radius; low-dimensional synthetic tasks also report a simplex-volume ratio. ## Wrapper Families - Global split conformal. - Group-wise / Mondrian conformal. - Two-stage normalization. - Exact or leave-one-out references where affordable. - Diagnostic variants such as OneShot, TrainRes, and the current weighted implementation. ## Output Interpretation The benchmark is designed to compare wrapper families under visible allocation-efficiency-compute tradeoffs. It should not be reduced to a single leaderboard or read as a conditional-coverage certification protocol. """ ).strip() write(DOCS_DIR / "benchmark_card.md", benchmark_card) write(DOCS_DIR / "evaluation_protocol.md", evaluation_protocol) def main() -> None: build_task_cards() build_docs() print("Built SimplexTasks-12 task cards and docs.") if __name__ == "__main__": main()