File size: 13,945 Bytes
"""Build task cards and benchmark docs for the SimplexTasks-12 release."""
from __future__ import annotations
import json
from pathlib import Path
from textwrap import dedent
import yaml
# Repository root: the parent of the directory that contains this script.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Root of the SimplexTasks-12 release tree; the per-task cards are written beneath it.
RELEASE_ROOT = REPO_ROOT / "release" / "simplextasks-12"
# Destination directory for the release-level Markdown docs.
DOCS_DIR = RELEASE_ROOT / "docs"
# Hand-written card text for the real tasks, keyed by the `task_id` stored in
# each task's metadata.json. Every entry supplies the five fields that
# `real_task_card` reads: evaluation_role, target_definition, default_score,
# default_stratification, and limitations.
REAL_EXTRAS = {
    "cifar10_softmax": {
        "evaluation_role": "Extreme classification-style stress test for allocation failure under fixed class-probability vectors.",
        "target_definition": "A 10-way one-hot class distribution paired with a frozen ResNet-18 softmax cache.",
        "default_score": "Total variation / L1 on the simplex.",
        "default_stratification": "Entropy bins of the softmax prediction.",
        "limitations": "This is a classification proxy rather than a naturally continuous composition task. The release ships derived arrays only and does not mirror raw CIFAR-10 images.",
    },
    "topics_20ng": {
        "evaluation_role": "Smooth-heterogeneity real task for topic-mixture prediction.",
        "target_definition": "Fixed 10-topic compositions constructed once from the 20 Newsgroups corpus, with a TF-IDF-to-topic-mixture kNN regressor as the predictor.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Entropy bins of the predicted topic mixture.",
        "limitations": "Both the target and the predictor output are model-derived topic mixtures. The release exposes derived simplex arrays, not the raw text corpus.",
    },
    "samson_unmixing": {
        "evaluation_role": "Natural low-dimensional compositional benchmark aligned with grouped repair.",
        "target_definition": "Three-endmember abundance compositions paired with a frozen NMF abundance estimator.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the abundance prediction.",
        "limitations": "The benchmark files are public and source-cited, but the upstream bundle did not include an explicit license file; downstream reuse should preserve attribution and the original benchmark context.",
    },
    "pbmc3k_pseudobulk": {
        "evaluation_role": "Semi-synthetic control with known composition targets.",
        "target_definition": "Pseudobulk cell-type fractions generated from PBMC3K together with a fixed NNLS deconvolution predictor.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the predicted cell-type fractions.",
        "limitations": "This is a semi-synthetic control rather than a deployment-ready deconvolution benchmark. The grouped-repair story is sensitive to the pseudobulk concentration setting, which is documented in the paper appendix.",
    },
    "utkface_age_ldl": {
        "evaluation_role": "Image-based label-distribution task with strong grouped heterogeneity.",
        "target_definition": "A 10-bin age label distribution constructed from UTKFace ages and predicted from thumbnail image features using PCA+kNN regression.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Entropy bins of the predicted age distribution.",
        "limitations": "The underlying face images are not redistributed and remain subject to the source dataset's research-use constraints. The benchmark ships derived features and simplex arrays only.",
    },
    "affectivetext_emotions": {
        "evaluation_role": "Small-sample weak-structure counterexample with a frozen language-model predictor.",
        "target_definition": "Normalized six-emotion gold scores from SemEval-2007 AffectiveText paired with a frozen zero-shot scorer cache.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the predicted emotion mixture.",
        "limitations": "The reported benchmark tables use a frozen closed-model cache; the release omits raw headlines and raw API responses. A fully open TF-IDF+SVD+kNN fallback cache-builder is included for local reproduction, but it is not identical to the main predictor used in the paper tables.",
    },
}
# Hand-written card text for the synthetic regimes, keyed by the `task_id`
# stored in each task's metadata.json. Every entry supplies the two fields
# that `synthetic_task_card` reads: evaluation_role and limitations.
SYNTH_EXTRAS = {
    "d1_homogeneous": {
        "evaluation_role": "Negative control with no residual-scale heterogeneity.",
        "limitations": "Intended to confirm that the benchmark does not manufacture allocation failures when the DGP is homogeneous.",
    },
    "d2_pure_scale": {
        "evaluation_role": "Canonical smooth-scale heterogeneity regime used to motivate normalization-based repair.",
        "limitations": "This regime isolates scale variation only; it is not intended to cover bias-type misspecification or changing tail shape.",
    },
    "d3_discrete_groups_aligned": {
        "evaluation_role": "Aligned discrete-group regime where group-wise calibration should have an inductive advantage.",
        "limitations": "The grouping is built into the DGP. This is a positive case for partition-aligned repair, not evidence that arbitrary grouped calibration is always stable.",
    },
    "d4_model_bias": {
        "evaluation_role": "Bias-type counterexample where changing the wrapper alone is not enough.",
        "limitations": "This regime is not reducible to a single local scale field, so normalization cannot be expected to fully repair it.",
    },
    "d5_heavy_tail": {
        "evaluation_role": "Smooth-scale regime with heavy tails to stress robustness beyond Gaussian residuals.",
        "limitations": "The regime is still synthetic and targeted; it does not span every possible deviation from the baseline score distribution.",
    },
    "d6_high_k": {
        "evaluation_role": "High-dimensional simplex regime exposing the compute and allocation challenges that appear when K is large.",
        "limitations": "Benchmark-scale exact methods are too expensive here; the paper marks the exact-reference entries separately as D6 dagger cells.",
    },
}
def load_json(path: Path) -> dict:
    """Read and parse the JSON document stored at ``path``.

    Args:
        path: Location of the JSON file (e.g. a task's ``metadata.json``).

    Returns:
        The decoded JSON value; callers in this module index it as a mapping.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding, which would mis-decode non-ASCII metadata on some systems.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
def load_yaml(path: Path) -> dict:
    """Read and parse the YAML document stored at ``path``.

    Args:
        path: Location of the YAML file (e.g. a task's ``config.yaml``).

    Returns:
        The parsed document; callers in this module index it as a mapping.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the contents are not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not vary with the platform's
    # locale encoding. `safe_load` is kept so no arbitrary objects are built.
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)
def write(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8 with exactly one trailing newline.

    Missing parent directories are created first. Any trailing whitespace in
    ``text`` is stripped before the single final newline is appended.

    Args:
        path: Destination file; overwritten if it already exists.
        text: Document body to write.

    Raises:
        OSError: If the directory cannot be created or the file not written.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding: the generated Markdown may contain non-ASCII text and
    # must be byte-identical regardless of the platform's locale encoding.
    path.write_text(text.rstrip() + "\n", encoding="utf-8")
def bullet_list(items: list[str]) -> str:
    """Render ``items`` as Markdown bullets, one ``- `` prefixed line per entry.

    An empty input yields an empty string; no trailing newline is added.
    """
    as_bullet = "- {}".format
    return "\n".join(map(as_bullet, items))
def synthetic_task_card(task_dir: Path, metadata: dict, config: dict, extra: dict) -> str:
    """Compose the Markdown task card for one synthetic regime.

    Args:
        task_dir: Directory of the task; accepted for signature parity with
            the caller but not read here.
        metadata: Parsed ``metadata.json`` of the task.
        config: Parsed ``config.yaml``; its ``evaluation``, ``dgp`` and
            ``data`` sections are read.
        extra: Hand-written text with ``evaluation_role`` and ``limitations``.

    Returns:
        The card as a single string without a trailing newline.

    Raises:
        KeyError: If a required key is missing from any of the mappings.
    """
    eval_cfg, dgp_cfg, data_cfg = config["evaluation"], config["dgp"], config["data"]

    title = f"# {metadata['task_name']} Task Card"
    overview = "\n".join(
        (
            f"- Task ID: `{metadata['task_id']}`",
            "- Subset: synthetic",
            f"- Samples: `{metadata['n_samples']}`",
            f"- Simplex dimension: `{metadata['simplex_dim']}`",
            f"- Predictor: {metadata['predictor']}",
            f"- Regime label: {metadata['regime_label']}",
        )
    )
    role_text = extra["evaluation_role"]
    dgp_summary = "\n".join(
        (
            f"- DGP family: `{dgp_cfg['name']}`",
            "- Default score: Aitchison distance",
            f"- Default stratification: `{eval_cfg['strata_method']}` with `{eval_cfg['n_strata']}` strata",
            f"- Calibration size: `{data_cfg['n_cal']}`",
            f"- Test size: `{data_cfg['n_test']}`",
            f"- Repetitions: `{data_cfg['n_rep']}`",
        )
    )
    array_bullets = bullet_list([f"`{array_name}`" for array_name in metadata["available_arrays"]])
    provenance = "\n".join(
        (
            f"- Source asset: {metadata['source_asset']}",
            f"- Config file: `{metadata['config_file']}`",
            f"- Redistribution: `{metadata['redistribution']}`",
            f"- Seed: `{metadata['seed']}`",
        )
    )
    limitations_text = extra["limitations"]

    # Headings and bodies alternate; a blank line separates every block.
    blocks = [
        title,
        overview,
        "## Evaluation Role",
        role_text,
        "## DGP Summary",
        dgp_summary,
        "## Release Contents",
        array_bullets,
        "## Provenance And Rebuild",
        provenance,
        "## Limitations",
        limitations_text,
    ]
    return "\n\n".join(blocks)
def real_task_card(task_dir: Path, metadata: dict, extra: dict) -> str:
    """Compose the Markdown task card for one real-data task.

    Args:
        task_dir: Directory of the task; accepted for signature parity with
            the caller but not read here.
        metadata: Parsed ``metadata.json`` of the task.
        extra: Hand-written text with ``evaluation_role``,
            ``target_definition``, ``default_score``,
            ``default_stratification`` and ``limitations``.

    Returns:
        The card as a single string without a trailing newline.

    Raises:
        KeyError: If a required key is missing from either mapping.
    """
    title = f"# {metadata['task_name']} Task Card"
    overview = "\n".join(
        (
            f"- Task ID: `{metadata['task_id']}`",
            "- Subset: real",
            f"- Samples: `{metadata['n_samples']}`",
            f"- Simplex dimension: `{metadata['simplex_dim']}`",
            f"- Predictor: {metadata['predictor']}",
        )
    )
    role_text = extra["evaluation_role"]
    target_text = extra["target_definition"]
    settings = "\n".join(
        (
            f"- Default score: {extra['default_score']}",
            f"- Default stratification: {extra['default_stratification']}",
            f"- Redistribution: `{metadata['redistribution']}`",
        )
    )
    array_bullets = bullet_list([f"`{array_name}`" for array_name in metadata["available_arrays"]])
    provenance = "\n".join(
        (
            f"- Source asset: {metadata['source_asset']}",
            f"- Metadata note: {metadata['notes']}",
        )
    )
    limitations_text = extra["limitations"]

    # Headings and bodies alternate; a blank line separates every block.
    blocks = [
        title,
        overview,
        "## Evaluation Role",
        role_text,
        "## Target And Predictor",
        target_text,
        "## Default Benchmark Settings",
        settings,
        "## Release Contents",
        array_bullets,
        "## Provenance And Usage Notes",
        provenance,
        "## Limitations",
        limitations_text,
    ]
    return "\n\n".join(blocks)
def _task_dirs(subset: str) -> list[Path]:
    """Return the task directories under ``RELEASE_ROOT / subset`` in sorted order."""
    # glob("*") also yields plain files (a README, .DS_Store, ...); only
    # directories can hold a metadata.json, so anything else is skipped.
    return [entry for entry in sorted((RELEASE_ROOT / subset).glob("*")) if entry.is_dir()]


def build_task_cards() -> None:
    """Write a ``task_card.md`` into every real and synthetic task directory.

    Each task's ``metadata.json`` (and, for synthetic tasks, ``config.yaml``)
    is loaded and combined with the hand-written text registered for its
    ``task_id`` in ``REAL_EXTRAS`` / ``SYNTH_EXTRAS``.

    Raises:
        KeyError: If a task's ``task_id`` has no entry in the matching table.
        OSError: If a metadata/config file is missing or a card cannot be
            written.
    """
    for task_dir in _task_dirs("real"):
        metadata = load_json(task_dir / "metadata.json")
        extra = REAL_EXTRAS[metadata["task_id"]]
        write(task_dir / "task_card.md", real_task_card(task_dir, metadata, extra))
    for task_dir in _task_dirs("synthetic"):
        metadata = load_json(task_dir / "metadata.json")
        config = load_yaml(task_dir / "config.yaml")
        extra = SYNTH_EXTRAS[metadata["task_id"]]
        write(task_dir / "task_card.md", synthetic_task_card(task_dir, metadata, config, extra))
def build_docs() -> None:
    """Write the release-level benchmark card and evaluation protocol.

    Both documents are fixed Markdown texts; they are written to
    ``benchmark_card.md`` and ``evaluation_protocol.md`` inside ``DOCS_DIR``.
    """
    # Output file name -> document body. `dedent` removes the source
    # indentation and `strip` drops the leading/trailing newlines.
    documents = {
        "benchmark_card.md": dedent(
            """
            # SimplexTasks-12 Benchmark Card
            SimplexTasks-12 is the processed task collection behind the SimplexUQ benchmark. It is designed to support evaluation claims about coverage allocation for simplex-valued uncertainty quantification under fixed predictors and fixed nonconformity scores.
            ## Supported Claims
            - Whether a fixed simplex-valued predictor exhibits allocation failure beyond its marginal coverage.
            - Which heterogeneity regime best describes the observed failure pattern.
            - Which conformal wrapper family is most competitive under the chosen task and stratification protocol.
            ## Claims The Benchmark Does Not Support
            - Universal wrapper rankings across all simplex tasks.
            - Deployment-readiness claims for any predictor.
            - Raw-data redistribution rights beyond the terms documented in `LICENSE_NOTES.md`.
            ## Benchmark Contents
            - 6 synthetic regimes with canonical `task.npz` bundles and copied YAML configs.
            - 6 fixed-predictor real tasks with cached `(Y, U)` arrays or derived features.
            - Per-task `task_card.md` files and `metadata.json` provenance records.
            - Release-level rebuild instructions for the paper tables and figures.
            ## Reproducibility Contract
            - Benchmark evaluation always operates on frozen predictor outputs.
            - Default stratification rules are fixed before wrapper comparison.
            - Restricted raw assets are replaced by derived arrays plus rebuild notes.
            - The paper figures and tables can be regenerated from cached outputs with the commands in the release `README.md`.
            ## Responsible Use
            Some tasks rely on non-redistributable source assets or derived features from sensitive domains such as face images. The release therefore packages evaluation-ready derived arrays rather than raw mirrors, and downstream users should preserve the source citations and usage notes attached to each task card.
            """
        ).strip(),
        "evaluation_protocol.md": dedent(
            """
            # SimplexTasks-12 Evaluation Protocol
            ## Fixed-Predictor Principle
            Every benchmark slice fixes the predictor first and then varies only the conformal wrapper. This keeps the evaluation target on uncertainty allocation rather than on predictor training.
            ## Default Scores
            - Real and synthetic composition tasks use Aitchison distance unless the target lies on simplex vertices.
            - CIFAR-10 uses total variation / L1 because Aitchison distance is ill-defined at the one-hot boundary.
            ## Stratification Rules
            - Default strata are entropy bins, boundary bins, or task-specific dominant-group partitions.
            - Sensitivity sweeps use fixed alternative stratifications defined from cached prediction vectors only.
            - Stratification maps are not tuned per wrapper and do not depend on calibration/test responses.
            ## Main Metrics
            - Marginal coverage.
            - Max disparity across prediction-space strata.
            - Worst-stratum coverage.
            - Coverage variance.
            - SSCV and mean radius; low-dimensional synthetic tasks also report a simplex-volume ratio.
            ## Wrapper Families
            - Global split conformal.
            - Group-wise / Mondrian conformal.
            - Two-stage normalization.
            - Exact or leave-one-out references where affordable.
            - Diagnostic variants such as OneShot, TrainRes, and the current weighted implementation.
            ## Output Interpretation
            The benchmark is designed to compare wrapper families under visible allocation-efficiency-compute tradeoffs. It should not be reduced to a single leaderboard or read as a conditional-coverage certification protocol.
            """
        ).strip(),
    }
    for filename, body in documents.items():
        write(DOCS_DIR / filename, body)
def main() -> None:
    """Regenerate every task card and release doc, then report completion."""
    for build_step in (build_task_cards, build_docs):
        build_step()
    print("Built SimplexTasks-12 task cards and docs.")


if __name__ == "__main__":
    # Run the build only when this file is executed as a script.
    main()
|