| """Build task cards and benchmark docs for the SimplexTasks-12 release.""" |
|
|
| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
| from textwrap import dedent |
|
|
| import yaml |
|
|
|
|
# Repository root: the parent of the directory that contains this script.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Root of the SimplexTasks-12 release tree inside the repository.
RELEASE_ROOT = REPO_ROOT.joinpath("release", "simplextasks-12")
# Directory that receives the release-level Markdown documents.
DOCS_DIR = RELEASE_ROOT.joinpath("docs")
|
|
# Hand-written card text for the real tasks, keyed by the ``task_id`` that
# build_task_cards reads from each task's ``metadata.json``.
# Every entry supplies the five fields real_task_card interpolates:
# ``evaluation_role``, ``target_definition``, ``default_score``,
# ``default_stratification`` and ``limitations``.
REAL_EXTRAS = {
    "cifar10_softmax": {
        "evaluation_role": "Extreme classification-style stress test for allocation failure under fixed class-probability vectors.",
        "target_definition": "A 10-way one-hot class distribution paired with a frozen ResNet-18 softmax cache.",
        "default_score": "Total variation / L1 on the simplex.",
        "default_stratification": "Entropy bins of the softmax prediction.",
        "limitations": "This is a classification proxy rather than a naturally continuous composition task. The release ships derived arrays only and does not mirror raw CIFAR-10 images.",
    },
    "topics_20ng": {
        "evaluation_role": "Smooth-heterogeneity real task for topic-mixture prediction.",
        "target_definition": "Fixed 10-topic compositions constructed once from the 20 Newsgroups corpus, with a TF-IDF-to-topic-mixture kNN regressor as the predictor.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Entropy bins of the predicted topic mixture.",
        "limitations": "Both the target and the predictor output are model-derived topic mixtures. The release exposes derived simplex arrays, not the raw text corpus.",
    },
    "samson_unmixing": {
        "evaluation_role": "Natural low-dimensional compositional benchmark aligned with grouped repair.",
        "target_definition": "Three-endmember abundance compositions paired with a frozen NMF abundance estimator.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the abundance prediction.",
        "limitations": "The benchmark files are public and source-cited, but the upstream bundle did not include an explicit license file; downstream reuse should preserve attribution and the original benchmark context.",
    },
    "pbmc3k_pseudobulk": {
        "evaluation_role": "Semi-synthetic control with known composition targets.",
        "target_definition": "Pseudobulk cell-type fractions generated from PBMC3K together with a fixed NNLS deconvolution predictor.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the predicted cell-type fractions.",
        "limitations": "This is a semi-synthetic control rather than a deployment-ready deconvolution benchmark. The grouped-repair story is sensitive to the pseudobulk concentration setting, which is documented in the paper appendix.",
    },
    "utkface_age_ldl": {
        "evaluation_role": "Image-based label-distribution task with strong grouped heterogeneity.",
        "target_definition": "A 10-bin age label distribution constructed from UTKFace ages and predicted from thumbnail image features using PCA+kNN regression.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Entropy bins of the predicted age distribution.",
        "limitations": "The underlying face images are not redistributed and remain subject to the source dataset's research-use constraints. The benchmark ships derived features and simplex arrays only.",
    },
    "affectivetext_emotions": {
        "evaluation_role": "Small-sample weak-structure counterexample with a frozen language-model predictor.",
        "target_definition": "Normalized six-emotion gold scores from SemEval-2007 AffectiveText paired with a frozen zero-shot scorer cache.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the predicted emotion mixture.",
        "limitations": "The reported benchmark tables use a frozen closed-model cache; the release omits raw headlines and raw API responses. A fully open TF-IDF+SVD+kNN fallback cache-builder is included for local reproduction, but it is not identical to the main predictor used in the paper tables.",
    },
}
|
|
# Hand-written card text for the synthetic regimes, keyed by the ``task_id``
# that build_task_cards reads from each task's ``metadata.json``.
# Every entry supplies the two fields synthetic_task_card interpolates:
# ``evaluation_role`` and ``limitations``.
SYNTH_EXTRAS = {
    "d1_homogeneous": {
        "evaluation_role": "Negative control with no residual-scale heterogeneity.",
        "limitations": "Intended to confirm that the benchmark does not manufacture allocation failures when the DGP is homogeneous.",
    },
    "d2_pure_scale": {
        "evaluation_role": "Canonical smooth-scale heterogeneity regime used to motivate normalization-based repair.",
        "limitations": "This regime isolates scale variation only; it is not intended to cover bias-type misspecification or changing tail shape.",
    },
    "d3_discrete_groups_aligned": {
        "evaluation_role": "Aligned discrete-group regime where group-wise calibration should have an inductive advantage.",
        "limitations": "The grouping is built into the DGP. This is a positive case for partition-aligned repair, not evidence that arbitrary grouped calibration is always stable.",
    },
    "d4_model_bias": {
        "evaluation_role": "Bias-type counterexample where changing the wrapper alone is not enough.",
        "limitations": "This regime is not reducible to a single local scale field, so normalization cannot be expected to fully repair it.",
    },
    "d5_heavy_tail": {
        "evaluation_role": "Smooth-scale regime with heavy tails to stress robustness beyond Gaussian residuals.",
        "limitations": "The regime is still synthetic and targeted; it does not span every possible deviation from the baseline score distribution.",
    },
    "d6_high_k": {
        "evaluation_role": "High-dimensional simplex regime exposing the compute and allocation challenges that appear when K is large.",
        "limitations": "Benchmark-scale exact methods are too expensive here; the paper marks the exact-reference entries separately as D6 dagger cells.",
    },
}
|
|
|
|
def load_json(path: Path) -> dict:
    """Read *path* as UTF-8 JSON and return the decoded document.

    The encoding is pinned to UTF-8 instead of being left to the platform
    locale, so non-ASCII metadata decodes identically on every machine.

    Args:
        path: Location of the JSON file.

    Returns:
        The value decoded by ``json.load``; callers in this module index it
        like a dict.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)
|
|
|
|
def load_yaml(path: Path) -> dict:
    """Read *path* as UTF-8 YAML and return the parsed document.

    The encoding is pinned to UTF-8 instead of being left to the platform
    locale, so configs with non-ASCII text parse identically everywhere.
    Parsing uses ``yaml.safe_load``.

    Args:
        path: Location of the YAML file.

    Returns:
        The value produced by ``yaml.safe_load``; callers in this module
        index it like a dict.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, encoding="utf-8") as handle:
        return yaml.safe_load(handle)
|
|
|
|
def write(path: Path, text: str) -> None:
    """Write *text* to *path* as UTF-8, ending with exactly one newline.

    Missing parent directories are created first. Trailing whitespace in
    *text* is stripped before the final newline is appended. The encoding is
    pinned to UTF-8 so the generated Markdown does not depend on the platform
    locale.

    Args:
        path: Destination file; overwritten if it already exists.
        text: Content to write.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(text.rstrip() + "\n", encoding="utf-8")
|
|
|
|
def bullet_list(items: list[str]) -> str:
    """Render *items* as a Markdown bullet list, one ``- `` entry per line.

    An empty list yields an empty string; no trailing newline is added.
    """
    as_bullet = "- {}".format
    return "\n".join(map(as_bullet, items))
|
|
|
|
def synthetic_task_card(task_dir: Path, metadata: dict, config: dict, extra: dict) -> str:
    """Render the Markdown task card for one synthetic regime.

    Args:
        task_dir: Task directory; accepted for the caller's benefit but not
            read by this function.
        metadata: Must provide ``task_name``, ``task_id``, ``n_samples``,
            ``simplex_dim``, ``predictor``, ``regime_label``,
            ``available_arrays``, ``source_asset``, ``config_file``,
            ``redistribution`` and ``seed``.
        config: Must provide ``evaluation`` (``strata_method``,
            ``n_strata``), ``dgp`` (``name``) and ``data`` (``n_cal``,
            ``n_test``, ``n_rep``).
        extra: Must provide ``evaluation_role`` and ``limitations``.

    Returns:
        The card as Markdown, paragraphs separated by one blank line, with no
        trailing newline.

    Raises:
        KeyError: If a required key is missing.
    """
    eval_cfg = config["evaluation"]
    dgp_cfg = config["dgp"]
    data_cfg = config["data"]

    # Each element is one paragraph; joining on a blank line gives the same
    # text as a flat list of lines with "" separators.
    paragraphs = (
        f"# {metadata['task_name']} Task Card",
        "\n".join(
            (
                f"- Task ID: `{metadata['task_id']}`",
                "- Subset: synthetic",
                f"- Samples: `{metadata['n_samples']}`",
                f"- Simplex dimension: `{metadata['simplex_dim']}`",
                f"- Predictor: {metadata['predictor']}",
                f"- Regime label: {metadata['regime_label']}",
            )
        ),
        "## Evaluation Role",
        extra["evaluation_role"],
        "## DGP Summary",
        "\n".join(
            (
                f"- DGP family: `{dgp_cfg['name']}`",
                "- Default score: Aitchison distance",
                f"- Default stratification: `{eval_cfg['strata_method']}` with `{eval_cfg['n_strata']}` strata",
                f"- Calibration size: `{data_cfg['n_cal']}`",
                f"- Test size: `{data_cfg['n_test']}`",
                f"- Repetitions: `{data_cfg['n_rep']}`",
            )
        ),
        "## Release Contents",
        bullet_list([f"`{array_name}`" for array_name in metadata["available_arrays"]]),
        "## Provenance And Rebuild",
        "\n".join(
            (
                f"- Source asset: {metadata['source_asset']}",
                f"- Config file: `{metadata['config_file']}`",
                f"- Redistribution: `{metadata['redistribution']}`",
                f"- Seed: `{metadata['seed']}`",
            )
        ),
        "## Limitations",
        extra["limitations"],
    )
    return "\n\n".join(paragraphs)
|
|
|
|
def real_task_card(task_dir: Path, metadata: dict, extra: dict) -> str:
    """Render the Markdown task card for one real (fixed-predictor) task.

    Args:
        task_dir: Task directory; accepted for the caller's benefit but not
            read by this function.
        metadata: Must provide ``task_name``, ``task_id``, ``n_samples``,
            ``simplex_dim``, ``predictor``, ``redistribution``,
            ``available_arrays``, ``source_asset`` and ``notes``.
        extra: Must provide ``evaluation_role``, ``target_definition``,
            ``default_score``, ``default_stratification`` and
            ``limitations``.

    Returns:
        The card as Markdown, paragraphs separated by one blank line, with no
        trailing newline.

    Raises:
        KeyError: If a required key is missing.
    """
    # Each element is one paragraph; joining on a blank line gives the same
    # text as a flat list of lines with "" separators.
    paragraphs = (
        f"# {metadata['task_name']} Task Card",
        "\n".join(
            (
                f"- Task ID: `{metadata['task_id']}`",
                "- Subset: real",
                f"- Samples: `{metadata['n_samples']}`",
                f"- Simplex dimension: `{metadata['simplex_dim']}`",
                f"- Predictor: {metadata['predictor']}",
            )
        ),
        "## Evaluation Role",
        extra["evaluation_role"],
        "## Target And Predictor",
        extra["target_definition"],
        "## Default Benchmark Settings",
        "\n".join(
            (
                f"- Default score: {extra['default_score']}",
                f"- Default stratification: {extra['default_stratification']}",
                f"- Redistribution: `{metadata['redistribution']}`",
            )
        ),
        "## Release Contents",
        bullet_list([f"`{array_name}`" for array_name in metadata["available_arrays"]]),
        "## Provenance And Usage Notes",
        "\n".join(
            (
                f"- Source asset: {metadata['source_asset']}",
                f"- Metadata note: {metadata['notes']}",
            )
        ),
        "## Limitations",
        extra["limitations"],
    )
    return "\n\n".join(paragraphs)
|
|
|
|
def build_task_cards() -> None:
    """Write ``task_card.md`` into every real and synthetic task directory.

    Task directories are the immediate subdirectories of
    ``RELEASE_ROOT / "real"`` and ``RELEASE_ROOT / "synthetic"``, processed in
    sorted order. Plain files sitting next to them (a README, ``.DS_Store``,
    and so on) are skipped; without that filter, looking for
    ``metadata.json`` beneath a file would abort the whole build.

    Raises:
        OSError: If a task directory lacks ``metadata.json`` (or, for
            synthetic tasks, ``config.yaml``).
        KeyError: If a task's ``task_id`` has no entry in ``REAL_EXTRAS`` /
            ``SYNTH_EXTRAS``, or a required metadata/config key is missing.
    """
    real_dirs = sorted(
        entry for entry in (RELEASE_ROOT / "real").glob("*") if entry.is_dir()
    )
    for task_dir in real_dirs:
        metadata = load_json(task_dir / "metadata.json")
        extra = REAL_EXTRAS[metadata["task_id"]]
        write(task_dir / "task_card.md", real_task_card(task_dir, metadata, extra))

    synthetic_dirs = sorted(
        entry for entry in (RELEASE_ROOT / "synthetic").glob("*") if entry.is_dir()
    )
    for task_dir in synthetic_dirs:
        metadata = load_json(task_dir / "metadata.json")
        config = load_yaml(task_dir / "config.yaml")
        extra = SYNTH_EXTRAS[metadata["task_id"]]
        write(task_dir / "task_card.md", synthetic_task_card(task_dir, metadata, config, extra))
|
|
|
|
def build_docs() -> None:
    """Write the release-level benchmark card and evaluation protocol.

    Both documents are static Markdown embedded below. Each is dedented,
    stripped of surrounding whitespace, and written under ``DOCS_DIR`` as
    ``benchmark_card.md`` and ``evaluation_protocol.md``, in that order.
    """
    pages = {
        "benchmark_card.md": dedent(
            """
            # SimplexTasks-12 Benchmark Card

            SimplexTasks-12 is the processed task collection behind the SimplexUQ benchmark. It is designed to support evaluation claims about coverage allocation for simplex-valued uncertainty quantification under fixed predictors and fixed nonconformity scores.

            ## Supported Claims

            - Whether a fixed simplex-valued predictor exhibits allocation failure beyond its marginal coverage.
            - Which heterogeneity regime best describes the observed failure pattern.
            - Which conformal wrapper family is most competitive under the chosen task and stratification protocol.

            ## Claims The Benchmark Does Not Support

            - Universal wrapper rankings across all simplex tasks.
            - Deployment-readiness claims for any predictor.
            - Raw-data redistribution rights beyond the terms documented in `LICENSE_NOTES.md`.

            ## Benchmark Contents

            - 6 synthetic regimes with canonical `task.npz` bundles and copied YAML configs.
            - 6 fixed-predictor real tasks with cached `(Y, U)` arrays or derived features.
            - Per-task `task_card.md` files and `metadata.json` provenance records.
            - Release-level rebuild instructions for the paper tables and figures.

            ## Reproducibility Contract

            - Benchmark evaluation always operates on frozen predictor outputs.
            - Default stratification rules are fixed before wrapper comparison.
            - Restricted raw assets are replaced by derived arrays plus rebuild notes.
            - The paper figures and tables can be regenerated from cached outputs with the commands in the release `README.md`.

            ## Responsible Use

            Some tasks rely on non-redistributable source assets or derived features from sensitive domains such as face images. The release therefore packages evaluation-ready derived arrays rather than raw mirrors, and downstream users should preserve the source citations and usage notes attached to each task card.
            """
        ).strip(),
        "evaluation_protocol.md": dedent(
            """
            # SimplexTasks-12 Evaluation Protocol

            ## Fixed-Predictor Principle

            Every benchmark slice fixes the predictor first and then varies only the conformal wrapper. This keeps the evaluation target on uncertainty allocation rather than on predictor training.

            ## Default Scores

            - Real and synthetic composition tasks use Aitchison distance unless the target lies on simplex vertices.
            - CIFAR-10 uses total variation / L1 because Aitchison distance is ill-defined at the one-hot boundary.

            ## Stratification Rules

            - Default strata are entropy bins, boundary bins, or task-specific dominant-group partitions.
            - Sensitivity sweeps use fixed alternative stratifications defined from cached prediction vectors only.
            - Stratification maps are not tuned per wrapper and do not depend on calibration/test responses.

            ## Main Metrics

            - Marginal coverage.
            - Max disparity across prediction-space strata.
            - Worst-stratum coverage.
            - Coverage variance.
            - SSCV and mean radius; low-dimensional synthetic tasks also report a simplex-volume ratio.

            ## Wrapper Families

            - Global split conformal.
            - Group-wise / Mondrian conformal.
            - Two-stage normalization.
            - Exact or leave-one-out references where affordable.
            - Diagnostic variants such as OneShot, TrainRes, and the current weighted implementation.

            ## Output Interpretation

            The benchmark is designed to compare wrapper families under visible allocation-efficiency-compute tradeoffs. It should not be reduced to a single leaderboard or read as a conditional-coverage certification protocol.
            """
        ).strip(),
    }

    # Dict insertion order keeps the write order: benchmark card, then protocol.
    for filename, markdown in pages.items():
        write(DOCS_DIR / filename, markdown)
|
|
|
|
def main() -> None:
    """Regenerate the task cards, then the release docs, and report on stdout."""
    for build_step in (build_task_cards, build_docs):
        build_step()
    print("Built SimplexTasks-12 task cards and docs.")
|
|
|
|
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|