simplexuq-code / scripts /build_simplextasks_docs.py
anonymous0523ly's picture
Initial anonymous code release
fc329a3 verified
raw
history blame
13.9 kB
"""Build task cards and benchmark docs for the SimplexTasks-12 release."""
from __future__ import annotations
import json
from pathlib import Path
from textwrap import dedent
import yaml
# Parent of the directory that contains this script (``parents[1]`` of the
# resolved script path).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Root of the SimplexTasks-12 release tree; the per-task folders are looked up
# in its ``real`` and ``synthetic`` subdirectories.
RELEASE_ROOT = REPO_ROOT / "release" / "simplextasks-12"
# Destination directory for the release-level Markdown documents.
DOCS_DIR = RELEASE_ROOT / "docs"
# Hand-written card text for the real tasks, keyed by the ``task_id`` read from
# each task's ``metadata.json``. Every entry supplies the five fields that
# ``real_task_card`` inserts verbatim: ``evaluation_role``,
# ``target_definition``, ``default_score``, ``default_stratification`` and
# ``limitations``.
REAL_EXTRAS = {
    "cifar10_softmax": {
        "evaluation_role": "Extreme classification-style stress test for allocation failure under fixed class-probability vectors.",
        "target_definition": "A 10-way one-hot class distribution paired with a frozen ResNet-18 softmax cache.",
        "default_score": "Total variation / L1 on the simplex.",
        "default_stratification": "Entropy bins of the softmax prediction.",
        "limitations": "This is a classification proxy rather than a naturally continuous composition task. The release ships derived arrays only and does not mirror raw CIFAR-10 images.",
    },
    "topics_20ng": {
        "evaluation_role": "Smooth-heterogeneity real task for topic-mixture prediction.",
        "target_definition": "Fixed 10-topic compositions constructed once from the 20 Newsgroups corpus, with a TF-IDF-to-topic-mixture kNN regressor as the predictor.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Entropy bins of the predicted topic mixture.",
        "limitations": "Both the target and the predictor output are model-derived topic mixtures. The release exposes derived simplex arrays, not the raw text corpus.",
    },
    "samson_unmixing": {
        "evaluation_role": "Natural low-dimensional compositional benchmark aligned with grouped repair.",
        "target_definition": "Three-endmember abundance compositions paired with a frozen NMF abundance estimator.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the abundance prediction.",
        "limitations": "The benchmark files are public and source-cited, but the upstream bundle did not include an explicit license file; downstream reuse should preserve attribution and the original benchmark context.",
    },
    "pbmc3k_pseudobulk": {
        "evaluation_role": "Semi-synthetic control with known composition targets.",
        "target_definition": "Pseudobulk cell-type fractions generated from PBMC3K together with a fixed NNLS deconvolution predictor.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the predicted cell-type fractions.",
        "limitations": "This is a semi-synthetic control rather than a deployment-ready deconvolution benchmark. The grouped-repair story is sensitive to the pseudobulk concentration setting, which is documented in the paper appendix.",
    },
    "utkface_age_ldl": {
        "evaluation_role": "Image-based label-distribution task with strong grouped heterogeneity.",
        "target_definition": "A 10-bin age label distribution constructed from UTKFace ages and predicted from thumbnail image features using PCA+kNN regression.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Entropy bins of the predicted age distribution.",
        "limitations": "The underlying face images are not redistributed and remain subject to the source dataset's research-use constraints. The benchmark ships derived features and simplex arrays only.",
    },
    "affectivetext_emotions": {
        "evaluation_role": "Small-sample weak-structure counterexample with a frozen language-model predictor.",
        "target_definition": "Normalized six-emotion gold scores from SemEval-2007 AffectiveText paired with a frozen zero-shot scorer cache.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the predicted emotion mixture.",
        "limitations": "The reported benchmark tables use a frozen closed-model cache; the release omits raw headlines and raw API responses. A fully open TF-IDF+SVD+kNN fallback cache-builder is included for local reproduction, but it is not identical to the main predictor used in the paper tables.",
    },
}
# Hand-written card text for the synthetic regimes, keyed by the ``task_id``
# read from each task's ``metadata.json``. Every entry supplies the two fields
# that ``synthetic_task_card`` inserts verbatim: ``evaluation_role`` and
# ``limitations``.
SYNTH_EXTRAS = {
    "d1_homogeneous": {
        "evaluation_role": "Negative control with no residual-scale heterogeneity.",
        "limitations": "Intended to confirm that the benchmark does not manufacture allocation failures when the DGP is homogeneous.",
    },
    "d2_pure_scale": {
        "evaluation_role": "Canonical smooth-scale heterogeneity regime used to motivate normalization-based repair.",
        "limitations": "This regime isolates scale variation only; it is not intended to cover bias-type misspecification or changing tail shape.",
    },
    "d3_discrete_groups_aligned": {
        "evaluation_role": "Aligned discrete-group regime where group-wise calibration should have an inductive advantage.",
        "limitations": "The grouping is built into the DGP. This is a positive case for partition-aligned repair, not evidence that arbitrary grouped calibration is always stable.",
    },
    "d4_model_bias": {
        "evaluation_role": "Bias-type counterexample where changing the wrapper alone is not enough.",
        "limitations": "This regime is not reducible to a single local scale field, so normalization cannot be expected to fully repair it.",
    },
    "d5_heavy_tail": {
        "evaluation_role": "Smooth-scale regime with heavy tails to stress robustness beyond Gaussian residuals.",
        "limitations": "The regime is still synthetic and targeted; it does not span every possible deviation from the baseline score distribution.",
    },
    "d6_high_k": {
        "evaluation_role": "High-dimensional simplex regime exposing the compute and allocation challenges that appear when K is large.",
        "limitations": "Benchmark-scale exact methods are too expensive here; the paper marks the exact-reference entries separately as D6 dagger cells.",
    },
}
def load_json(path: Path) -> dict:
    """Parse the JSON document stored at ``path``.

    The file is decoded as UTF-8 explicitly. Without an ``encoding`` argument
    ``open`` falls back to the locale's preferred encoding, which can
    mis-decode or reject non-ASCII metadata on platforms whose default is not
    UTF-8.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON value (callers in this file expect an object/dict).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
def load_yaml(path: Path) -> dict:
    """Parse the YAML document stored at ``path`` with ``yaml.safe_load``.

    The file is opened as UTF-8 explicitly so that parsing does not depend on
    the locale's preferred encoding.

    Args:
        path: Location of the YAML file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document; callers in this
        file index the result as a mapping.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)
def write(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, creating parent directories.

    Trailing whitespace is stripped from ``text`` and a single newline is
    appended before writing. The encoding is passed explicitly because
    ``Path.write_text`` otherwise uses the locale's preferred encoding, which
    would make the generated Markdown differ between platforms (or fail on
    non-ASCII characters).

    Args:
        path: Destination file; missing parent directories are created.
        text: Content to write.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(text.rstrip() + "\n", encoding="utf-8")
def bullet_list(items: list[str]) -> str:
    """Render ``items`` as a Markdown bullet list.

    Each entry becomes one ``- `` prefixed line; lines are separated by a
    newline with no trailing newline. An empty input yields an empty string.
    """
    rendered = ["- {}".format(entry) for entry in items]
    return "\n".join(rendered)
def synthetic_task_card(task_dir: Path, metadata: dict, config: dict, extra: dict) -> str:
    """Render the Markdown task card for one synthetic regime.

    Args:
        task_dir: Directory of the task. Part of the signature but not read
            by this function.
        metadata: Parsed ``metadata.json`` record of the task.
        config: Parsed ``config.yaml``; its ``evaluation``, ``dgp`` and
            ``data`` sections are read.
        extra: Hand-written ``evaluation_role`` and ``limitations`` text.

    Returns:
        The card text, without a trailing newline.

    Raises:
        KeyError: If any required key is missing from the inputs.
    """
    eval_cfg, dgp_cfg, data_cfg = config["evaluation"], config["dgp"], config["data"]

    # Each element is one blank-line-separated block of the card: either a
    # heading, a paragraph, or a run of bullet lines.
    blocks = [
        f"# {metadata['task_name']} Task Card",
        "\n".join(
            (
                f"- Task ID: `{metadata['task_id']}`",
                "- Subset: synthetic",
                f"- Samples: `{metadata['n_samples']}`",
                f"- Simplex dimension: `{metadata['simplex_dim']}`",
                f"- Predictor: {metadata['predictor']}",
                f"- Regime label: {metadata['regime_label']}",
            )
        ),
        "## Evaluation Role",
        extra["evaluation_role"],
        "## DGP Summary",
        "\n".join(
            (
                f"- DGP family: `{dgp_cfg['name']}`",
                "- Default score: Aitchison distance",
                f"- Default stratification: `{eval_cfg['strata_method']}` with `{eval_cfg['n_strata']}` strata",
                f"- Calibration size: `{data_cfg['n_cal']}`",
                f"- Test size: `{data_cfg['n_test']}`",
                f"- Repetitions: `{data_cfg['n_rep']}`",
            )
        ),
        "## Release Contents",
        bullet_list([f"`{array_name}`" for array_name in metadata["available_arrays"]]),
        "## Provenance And Rebuild",
        "\n".join(
            (
                f"- Source asset: {metadata['source_asset']}",
                f"- Config file: `{metadata['config_file']}`",
                f"- Redistribution: `{metadata['redistribution']}`",
                f"- Seed: `{metadata['seed']}`",
            )
        ),
        "## Limitations",
        extra["limitations"],
    ]
    return "\n\n".join(blocks)
def real_task_card(task_dir: Path, metadata: dict, extra: dict) -> str:
    """Render the Markdown task card for one real (fixed-predictor) task.

    Args:
        task_dir: Directory of the task. Part of the signature but not read
            by this function.
        metadata: Parsed ``metadata.json`` record of the task.
        extra: Hand-written ``evaluation_role``, ``target_definition``,
            ``default_score``, ``default_stratification`` and ``limitations``
            text.

    Returns:
        The card text, without a trailing newline.

    Raises:
        KeyError: If any required key is missing from the inputs.
    """
    # Each element is one blank-line-separated block of the card: either a
    # heading, a paragraph, or a run of bullet lines.
    blocks = [
        f"# {metadata['task_name']} Task Card",
        "\n".join(
            (
                f"- Task ID: `{metadata['task_id']}`",
                "- Subset: real",
                f"- Samples: `{metadata['n_samples']}`",
                f"- Simplex dimension: `{metadata['simplex_dim']}`",
                f"- Predictor: {metadata['predictor']}",
            )
        ),
        "## Evaluation Role",
        extra["evaluation_role"],
        "## Target And Predictor",
        extra["target_definition"],
        "## Default Benchmark Settings",
        "\n".join(
            (
                f"- Default score: {extra['default_score']}",
                f"- Default stratification: {extra['default_stratification']}",
                f"- Redistribution: `{metadata['redistribution']}`",
            )
        ),
        "## Release Contents",
        bullet_list([f"`{array_name}`" for array_name in metadata["available_arrays"]]),
        "## Provenance And Usage Notes",
        "\n".join(
            (
                f"- Source asset: {metadata['source_asset']}",
                f"- Metadata note: {metadata['notes']}",
            )
        ),
        "## Limitations",
        extra["limitations"],
    ]
    return "\n\n".join(blocks)
def _task_dirs(subset: str) -> list[Path]:
    """Return the sorted task directories under ``RELEASE_ROOT / subset``."""
    # ``glob("*")`` also yields plain files (e.g. ``.DS_Store`` or a README);
    # those are not tasks and would fail when ``metadata.json`` is looked up
    # beneath them, so only directories are kept.
    return sorted(p for p in (RELEASE_ROOT / subset).glob("*") if p.is_dir())


def build_task_cards() -> None:
    """Write ``task_card.md`` into every real and synthetic task directory.

    Real tasks are rendered from ``metadata.json`` plus ``REAL_EXTRAS``;
    synthetic tasks additionally read ``config.yaml`` and use
    ``SYNTH_EXTRAS``. Directories are processed in sorted order.

    Raises:
        KeyError: If a task's ``task_id`` has no entry in the matching extras
            table, or a required metadata/config key is missing.
    """
    for task_dir in _task_dirs("real"):
        metadata = load_json(task_dir / "metadata.json")
        task_id = metadata["task_id"]
        if task_id not in REAL_EXTRAS:
            # Same exception type as a plain lookup, but names the offender.
            raise KeyError(f"{task_dir}: task_id {task_id!r} has no entry in REAL_EXTRAS")
        extra = REAL_EXTRAS[task_id]
        write(task_dir / "task_card.md", real_task_card(task_dir, metadata, extra))

    for task_dir in _task_dirs("synthetic"):
        metadata = load_json(task_dir / "metadata.json")
        config = load_yaml(task_dir / "config.yaml")
        task_id = metadata["task_id"]
        if task_id not in SYNTH_EXTRAS:
            raise KeyError(f"{task_dir}: task_id {task_id!r} has no entry in SYNTH_EXTRAS")
        extra = SYNTH_EXTRAS[task_id]
        write(task_dir / "task_card.md", synthetic_task_card(task_dir, metadata, config, extra))
def build_docs() -> None:
    """Write the release-level benchmark card and evaluation protocol.

    Both documents are fixed Markdown texts embedded below. Each is passed
    through ``dedent`` and ``strip`` and then written into ``DOCS_DIR`` as
    ``benchmark_card.md`` and ``evaluation_protocol.md`` respectively.
    """
    # Benchmark card: supported / unsupported claims, contents,
    # reproducibility contract and responsible-use note.
    benchmark_card = dedent(
        """
# SimplexTasks-12 Benchmark Card
SimplexTasks-12 is the processed task collection behind the SimplexUQ benchmark. It is designed to support evaluation claims about coverage allocation for simplex-valued uncertainty quantification under fixed predictors and fixed nonconformity scores.
## Supported Claims
- Whether a fixed simplex-valued predictor exhibits allocation failure beyond its marginal coverage.
- Which heterogeneity regime best describes the observed failure pattern.
- Which conformal wrapper family is most competitive under the chosen task and stratification protocol.
## Claims The Benchmark Does Not Support
- Universal wrapper rankings across all simplex tasks.
- Deployment-readiness claims for any predictor.
- Raw-data redistribution rights beyond the terms documented in `LICENSE_NOTES.md`.
## Benchmark Contents
- 6 synthetic regimes with canonical `task.npz` bundles and copied YAML configs.
- 6 fixed-predictor real tasks with cached `(Y, U)` arrays or derived features.
- Per-task `task_card.md` files and `metadata.json` provenance records.
- Release-level rebuild instructions for the paper tables and figures.
## Reproducibility Contract
- Benchmark evaluation always operates on frozen predictor outputs.
- Default stratification rules are fixed before wrapper comparison.
- Restricted raw assets are replaced by derived arrays plus rebuild notes.
- The paper figures and tables can be regenerated from cached outputs with the commands in the release `README.md`.
## Responsible Use
Some tasks rely on non-redistributable source assets or derived features from sensitive domains such as face images. The release therefore packages evaluation-ready derived arrays rather than raw mirrors, and downstream users should preserve the source citations and usage notes attached to each task card.
"""
    ).strip()
    # Evaluation protocol: fixed-predictor principle, default scores,
    # stratification rules, metrics, wrapper families and interpretation.
    evaluation_protocol = dedent(
        """
# SimplexTasks-12 Evaluation Protocol
## Fixed-Predictor Principle
Every benchmark slice fixes the predictor first and then varies only the conformal wrapper. This keeps the evaluation target on uncertainty allocation rather than on predictor training.
## Default Scores
- Real and synthetic composition tasks use Aitchison distance unless the target lies on simplex vertices.
- CIFAR-10 uses total variation / L1 because Aitchison distance is ill-defined at the one-hot boundary.
## Stratification Rules
- Default strata are entropy bins, boundary bins, or task-specific dominant-group partitions.
- Sensitivity sweeps use fixed alternative stratifications defined from cached prediction vectors only.
- Stratification maps are not tuned per wrapper and do not depend on calibration/test responses.
## Main Metrics
- Marginal coverage.
- Max disparity across prediction-space strata.
- Worst-stratum coverage.
- Coverage variance.
- SSCV and mean radius; low-dimensional synthetic tasks also report a simplex-volume ratio.
## Wrapper Families
- Global split conformal.
- Group-wise / Mondrian conformal.
- Two-stage normalization.
- Exact or leave-one-out references where affordable.
- Diagnostic variants such as OneShot, TrainRes, and the current weighted implementation.
## Output Interpretation
The benchmark is designed to compare wrapper families under visible allocation-efficiency-compute tradeoffs. It should not be reduced to a single leaderboard or read as a conditional-coverage certification protocol.
"""
    ).strip()
    # Persist both documents under the release docs directory.
    write(DOCS_DIR / "benchmark_card.md", benchmark_card)
    write(DOCS_DIR / "evaluation_protocol.md", evaluation_protocol)
def main() -> None:
    """Regenerate all task cards and release docs, then report completion."""
    # Task cards first, then the release-level documents.
    for build_step in (build_task_cards, build_docs):
        build_step()
    print("Built SimplexTasks-12 task cards and docs.")
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()