simplexuq-code / scripts /build_simplextasks_docs.py

Initial anonymous code release

fc329a3 verified 25 days ago

13.9 kB

	"""Build task cards and benchmark docs for the SimplexTasks-12 release."""

	from __future__ import annotations

	import json
	from pathlib import Path
	from textwrap import dedent

	import yaml


	# Repository root: two levels up from this file's resolved path (the parent
	# of the directory that contains this script).
	REPO_ROOT = Path(__file__).resolve().parents[1]
	# Root of the SimplexTasks-12 release tree; build_task_cards() scans its
	# "real" and "synthetic" sub-directories.
	RELEASE_ROOT = REPO_ROOT / "release" / "simplextasks-12"
	# Destination directory for the release-level Markdown docs written by
	# build_docs().
	DOCS_DIR = RELEASE_ROOT / "docs"

	# Hand-written card text for the real tasks, keyed by the `task_id` stored in
	# each task's metadata.json. build_task_cards() indexes this table directly,
	# so a real task directory whose id is missing here raises KeyError.
	# Every entry supplies the five fields that real_task_card() reads:
	# evaluation_role, target_definition, default_score, default_stratification
	# and limitations.
	REAL_EXTRAS = {
	"cifar10_softmax": {
	"evaluation_role": "Extreme classification-style stress test for allocation failure under fixed class-probability vectors.",
	"target_definition": "A 10-way one-hot class distribution paired with a frozen ResNet-18 softmax cache.",
	"default_score": "Total variation / L1 on the simplex.",
	"default_stratification": "Entropy bins of the softmax prediction.",
	"limitations": "This is a classification proxy rather than a naturally continuous composition task. The release ships derived arrays only and does not mirror raw CIFAR-10 images.",
	},
	"topics_20ng": {
	"evaluation_role": "Smooth-heterogeneity real task for topic-mixture prediction.",
	"target_definition": "Fixed 10-topic compositions constructed once from the 20 Newsgroups corpus, with a TF-IDF-to-topic-mixture kNN regressor as the predictor.",
	"default_score": "Aitchison distance.",
	"default_stratification": "Entropy bins of the predicted topic mixture.",
	"limitations": "Both the target and the predictor output are model-derived topic mixtures. The release exposes derived simplex arrays, not the raw text corpus.",
	},
	"samson_unmixing": {
	"evaluation_role": "Natural low-dimensional compositional benchmark aligned with grouped repair.",
	"target_definition": "Three-endmember abundance compositions paired with a frozen NMF abundance estimator.",
	"default_score": "Aitchison distance.",
	"default_stratification": "Boundary bins on the abundance prediction.",
	"limitations": "The benchmark files are public and source-cited, but the upstream bundle did not include an explicit license file; downstream reuse should preserve attribution and the original benchmark context.",
	},
	"pbmc3k_pseudobulk": {
	"evaluation_role": "Semi-synthetic control with known composition targets.",
	"target_definition": "Pseudobulk cell-type fractions generated from PBMC3K together with a fixed NNLS deconvolution predictor.",
	"default_score": "Aitchison distance.",
	"default_stratification": "Boundary bins on the predicted cell-type fractions.",
	"limitations": "This is a semi-synthetic control rather than a deployment-ready deconvolution benchmark. The grouped-repair story is sensitive to the pseudobulk concentration setting, which is documented in the paper appendix.",
	},
	"utkface_age_ldl": {
	"evaluation_role": "Image-based label-distribution task with strong grouped heterogeneity.",
	"target_definition": "A 10-bin age label distribution constructed from UTKFace ages and predicted from thumbnail image features using PCA+kNN regression.",
	"default_score": "Aitchison distance.",
	"default_stratification": "Entropy bins of the predicted age distribution.",
	"limitations": "The underlying face images are not redistributed and remain subject to the source dataset's research-use constraints. The benchmark ships derived features and simplex arrays only.",
	},
	"affectivetext_emotions": {
	"evaluation_role": "Small-sample weak-structure counterexample with a frozen language-model predictor.",
	"target_definition": "Normalized six-emotion gold scores from SemEval-2007 AffectiveText paired with a frozen zero-shot scorer cache.",
	"default_score": "Aitchison distance.",
	"default_stratification": "Boundary bins on the predicted emotion mixture.",
	"limitations": "The reported benchmark tables use a frozen closed-model cache; the release omits raw headlines and raw API responses. A fully open TF-IDF+SVD+kNN fallback cache-builder is included for local reproduction, but it is not identical to the main predictor used in the paper tables.",
	},
	}

	# Hand-written card text for the synthetic regimes, keyed by the `task_id`
	# stored in each task's metadata.json. build_task_cards() indexes this table
	# directly, so a synthetic task directory whose id is missing here raises
	# KeyError.
	# Only evaluation_role and limitations are supplied: synthetic_task_card()
	# takes the stratification and sizing details from the task's config.yaml and
	# hard-codes the default score line.
	SYNTH_EXTRAS = {
	"d1_homogeneous": {
	"evaluation_role": "Negative control with no residual-scale heterogeneity.",
	"limitations": "Intended to confirm that the benchmark does not manufacture allocation failures when the DGP is homogeneous.",
	},
	"d2_pure_scale": {
	"evaluation_role": "Canonical smooth-scale heterogeneity regime used to motivate normalization-based repair.",
	"limitations": "This regime isolates scale variation only; it is not intended to cover bias-type misspecification or changing tail shape.",
	},
	"d3_discrete_groups_aligned": {
	"evaluation_role": "Aligned discrete-group regime where group-wise calibration should have an inductive advantage.",
	"limitations": "The grouping is built into the DGP. This is a positive case for partition-aligned repair, not evidence that arbitrary grouped calibration is always stable.",
	},
	"d4_model_bias": {
	"evaluation_role": "Bias-type counterexample where changing the wrapper alone is not enough.",
	"limitations": "This regime is not reducible to a single local scale field, so normalization cannot be expected to fully repair it.",
	},
	"d5_heavy_tail": {
	"evaluation_role": "Smooth-scale regime with heavy tails to stress robustness beyond Gaussian residuals.",
	"limitations": "The regime is still synthetic and targeted; it does not span every possible deviation from the baseline score distribution.",
	},
	"d6_high_k": {
	"evaluation_role": "High-dimensional simplex regime exposing the compute and allocation challenges that appear when K is large.",
	"limitations": "Benchmark-scale exact methods are too expensive here; the paper marks the exact-reference entries separately as D6 dagger cells.",
	},
	}


	def load_json(path: Path) -> dict:
	    """Parse the JSON document stored at ``path``.

	    The file is opened explicitly as UTF-8 so that non-ASCII metadata is
	    decoded the same way on every platform instead of depending on the
	    locale's default encoding.

	    Args:
	        path: Location of the JSON file to read.

	    Returns:
	        The value produced by ``json.load`` (a mapping for the metadata
	        files this script reads).

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the content is not valid JSON.
	    """
	    with open(path, encoding="utf-8") as handle:
	        return json.load(handle)


	def load_yaml(path: Path) -> dict:
	    """Parse the YAML document stored at ``path`` with the safe loader.

	    The file is opened explicitly as UTF-8 so that decoding does not depend
	    on the locale's default encoding.

	    Args:
	        path: Location of the YAML file to read.

	    Returns:
	        Whatever ``yaml.safe_load`` produces for the document. Callers in
	        this script index the result as a mapping.

	    Raises:
	        OSError: If the file cannot be opened.
	        yaml.YAMLError: If the content is not valid YAML.
	    """
	    with open(path, encoding="utf-8") as handle:
	        return yaml.safe_load(handle)


	def write(path: Path, text: str) -> None:
	    """Write ``text`` to ``path`` as UTF-8, ending in exactly one newline.

	    Missing parent directories are created first. Trailing whitespace
	    (including blank lines) is stripped from ``text`` before a single
	    newline is appended, so an empty ``text`` yields a one-line file.
	    The encoding is pinned to UTF-8 so the generated Markdown is identical
	    on every platform instead of following the locale default.

	    Args:
	        path: Destination file; overwritten if it already exists.
	        text: Content to write.
	    """
	    path.parent.mkdir(parents=True, exist_ok=True)
	    path.write_text(text.rstrip() + "\n", encoding="utf-8")


	def bullet_list(items: list[str]) -> str:
	    """Format ``items`` as a Markdown bullet list.

	    Each item becomes one ``- item`` line; lines are separated by newlines
	    with no trailing newline. An empty ``items`` gives an empty string.
	    """
	    return "\n".join(map("- {}".format, items))


	def synthetic_task_card(task_dir: Path, metadata: dict, config: dict, extra: dict) -> str:
	    """Render the Markdown task card for one synthetic task.

	    Args:
	        task_dir: Task directory. Accepted from the caller but not read
	            by this function.
	        metadata: Parsed ``metadata.json``. Reads ``task_name``,
	            ``task_id``, ``n_samples``, ``simplex_dim``, ``predictor``,
	            ``regime_label``, ``available_arrays``, ``source_asset``,
	            ``config_file``, ``redistribution`` and ``seed``.
	        config: Parsed ``config.yaml``. Reads ``evaluation.strata_method``,
	            ``evaluation.n_strata``, ``dgp.name``, ``data.n_cal``,
	            ``data.n_test`` and ``data.n_rep``.
	        extra: Hand-written text with ``evaluation_role`` and
	            ``limitations``.

	    Returns:
	        The card as Markdown text without a trailing newline.

	    Raises:
	        KeyError: If any of the keys listed above is missing.
	    """
	    eval_cfg, dgp_cfg, data_cfg = config["evaluation"], config["dgp"], config["data"]
	    # Each element is one heading, paragraph or bullet group; the card puts
	    # exactly one blank line between consecutive elements.
	    blocks = [
	        f"# {metadata['task_name']} Task Card",
	        "\n".join(
	            [
	                f"- Task ID: `{metadata['task_id']}`",
	                "- Subset: synthetic",
	                f"- Samples: `{metadata['n_samples']}`",
	                f"- Simplex dimension: `{metadata['simplex_dim']}`",
	                f"- Predictor: {metadata['predictor']}",
	                f"- Regime label: {metadata['regime_label']}",
	            ]
	        ),
	        "## Evaluation Role",
	        extra["evaluation_role"],
	        "## DGP Summary",
	        "\n".join(
	            [
	                f"- DGP family: `{dgp_cfg['name']}`",
	                "- Default score: Aitchison distance",
	                f"- Default stratification: `{eval_cfg['strata_method']}` with `{eval_cfg['n_strata']}` strata",
	                f"- Calibration size: `{data_cfg['n_cal']}`",
	                f"- Test size: `{data_cfg['n_test']}`",
	                f"- Repetitions: `{data_cfg['n_rep']}`",
	            ]
	        ),
	        "## Release Contents",
	        bullet_list([f"`{name}`" for name in metadata["available_arrays"]]),
	        "## Provenance And Rebuild",
	        "\n".join(
	            [
	                f"- Source asset: {metadata['source_asset']}",
	                f"- Config file: `{metadata['config_file']}`",
	                f"- Redistribution: `{metadata['redistribution']}`",
	                f"- Seed: `{metadata['seed']}`",
	            ]
	        ),
	        "## Limitations",
	        extra["limitations"],
	    ]
	    return "\n\n".join(blocks)


	def real_task_card(task_dir: Path, metadata: dict, extra: dict) -> str:
	    """Render the Markdown task card for one real task.

	    Args:
	        task_dir: Task directory. Accepted from the caller but not read
	            by this function.
	        metadata: Parsed ``metadata.json``. Reads ``task_name``,
	            ``task_id``, ``n_samples``, ``simplex_dim``, ``predictor``,
	            ``redistribution``, ``available_arrays``, ``source_asset``
	            and ``notes``.
	        extra: Hand-written text with ``evaluation_role``,
	            ``target_definition``, ``default_score``,
	            ``default_stratification`` and ``limitations``.

	    Returns:
	        The card as Markdown text without a trailing newline.

	    Raises:
	        KeyError: If any of the keys listed above is missing.
	    """
	    # Each element is one heading, paragraph or bullet group; the card puts
	    # exactly one blank line between consecutive elements.
	    blocks = [
	        f"# {metadata['task_name']} Task Card",
	        "\n".join(
	            [
	                f"- Task ID: `{metadata['task_id']}`",
	                "- Subset: real",
	                f"- Samples: `{metadata['n_samples']}`",
	                f"- Simplex dimension: `{metadata['simplex_dim']}`",
	                f"- Predictor: {metadata['predictor']}",
	            ]
	        ),
	        "## Evaluation Role",
	        extra["evaluation_role"],
	        "## Target And Predictor",
	        extra["target_definition"],
	        "## Default Benchmark Settings",
	        "\n".join(
	            [
	                f"- Default score: {extra['default_score']}",
	                f"- Default stratification: {extra['default_stratification']}",
	                f"- Redistribution: `{metadata['redistribution']}`",
	            ]
	        ),
	        "## Release Contents",
	        bullet_list([f"`{name}`" for name in metadata["available_arrays"]]),
	        "## Provenance And Usage Notes",
	        "\n".join(
	            [
	                f"- Source asset: {metadata['source_asset']}",
	                f"- Metadata note: {metadata['notes']}",
	            ]
	        ),
	        "## Limitations",
	        extra["limitations"],
	    ]
	    return "\n\n".join(blocks)


	def _task_dirs(subset: str) -> list[Path]:
	    """Return the sorted task directories directly under ``RELEASE_ROOT / subset``."""
	    # Plain files that happen to sit next to the task folders (OS metadata,
	    # a README, ...) are not tasks; descending into them would fail when
	    # looking for their metadata.json.
	    return sorted(entry for entry in (RELEASE_ROOT / subset).glob("*") if entry.is_dir())


	def build_task_cards() -> None:
	    """Write a ``task_card.md`` into every real and synthetic task directory.

	    For each directory under ``RELEASE_ROOT / "real"`` the card is rendered
	    from its ``metadata.json`` and the matching ``REAL_EXTRAS`` entry. For
	    each directory under ``RELEASE_ROOT / "synthetic"`` the card is rendered
	    from ``metadata.json``, ``config.yaml`` and the matching ``SYNTH_EXTRAS``
	    entry. Directories are processed in sorted order; non-directory entries
	    are ignored.

	    Raises:
	        KeyError: If a task's ``task_id`` has no entry in the corresponding
	            extras table.
	        OSError: If a task directory lacks a required input file.
	    """
	    for task_dir in _task_dirs("real"):
	        metadata = load_json(task_dir / "metadata.json")
	        extra = REAL_EXTRAS[metadata["task_id"]]
	        write(task_dir / "task_card.md", real_task_card(task_dir, metadata, extra))

	    for task_dir in _task_dirs("synthetic"):
	        metadata = load_json(task_dir / "metadata.json")
	        config = load_yaml(task_dir / "config.yaml")
	        extra = SYNTH_EXTRAS[metadata["task_id"]]
	        write(task_dir / "task_card.md", synthetic_task_card(task_dir, metadata, config, extra))


	def build_docs() -> None:
	    """Write the release-level Markdown documents into ``DOCS_DIR``.

	    Two files are produced, in this order: ``benchmark_card.md`` and
	    ``evaluation_protocol.md``. Each text is dedented and stripped before
	    being handed to ``write``.
	    """
	    # File name -> raw (indented) Markdown source.
	    sources = {
	        "benchmark_card.md": """
	        # SimplexTasks-12 Benchmark Card

	        SimplexTasks-12 is the processed task collection behind the SimplexUQ benchmark. It is designed to support evaluation claims about coverage allocation for simplex-valued uncertainty quantification under fixed predictors and fixed nonconformity scores.

	        ## Supported Claims

	        - Whether a fixed simplex-valued predictor exhibits allocation failure beyond its marginal coverage.
	        - Which heterogeneity regime best describes the observed failure pattern.
	        - Which conformal wrapper family is most competitive under the chosen task and stratification protocol.

	        ## Claims The Benchmark Does Not Support

	        - Universal wrapper rankings across all simplex tasks.
	        - Deployment-readiness claims for any predictor.
	        - Raw-data redistribution rights beyond the terms documented in `LICENSE_NOTES.md`.

	        ## Benchmark Contents

	        - 6 synthetic regimes with canonical `task.npz` bundles and copied YAML configs.
	        - 6 fixed-predictor real tasks with cached `(Y, U)` arrays or derived features.
	        - Per-task `task_card.md` files and `metadata.json` provenance records.
	        - Release-level rebuild instructions for the paper tables and figures.

	        ## Reproducibility Contract

	        - Benchmark evaluation always operates on frozen predictor outputs.
	        - Default stratification rules are fixed before wrapper comparison.
	        - Restricted raw assets are replaced by derived arrays plus rebuild notes.
	        - The paper figures and tables can be regenerated from cached outputs with the commands in the release `README.md`.

	        ## Responsible Use

	        Some tasks rely on non-redistributable source assets or derived features from sensitive domains such as face images. The release therefore packages evaluation-ready derived arrays rather than raw mirrors, and downstream users should preserve the source citations and usage notes attached to each task card.
	        """,
	        "evaluation_protocol.md": """
	        # SimplexTasks-12 Evaluation Protocol

	        ## Fixed-Predictor Principle

	        Every benchmark slice fixes the predictor first and then varies only the conformal wrapper. This keeps the evaluation target on uncertainty allocation rather than on predictor training.

	        ## Default Scores

	        - Real and synthetic composition tasks use Aitchison distance unless the target lies on simplex vertices.
	        - CIFAR-10 uses total variation / L1 because Aitchison distance is ill-defined at the one-hot boundary.

	        ## Stratification Rules

	        - Default strata are entropy bins, boundary bins, or task-specific dominant-group partitions.
	        - Sensitivity sweeps use fixed alternative stratifications defined from cached prediction vectors only.
	        - Stratification maps are not tuned per wrapper and do not depend on calibration/test responses.

	        ## Main Metrics

	        - Marginal coverage.
	        - Max disparity across prediction-space strata.
	        - Worst-stratum coverage.
	        - Coverage variance.
	        - SSCV and mean radius; low-dimensional synthetic tasks also report a simplex-volume ratio.

	        ## Wrapper Families

	        - Global split conformal.
	        - Group-wise / Mondrian conformal.
	        - Two-stage normalization.
	        - Exact or leave-one-out references where affordable.
	        - Diagnostic variants such as OneShot, TrainRes, and the current weighted implementation.

	        ## Output Interpretation

	        The benchmark is designed to compare wrapper families under visible allocation-efficiency-compute tradeoffs. It should not be reduced to a single leaderboard or read as a conditional-coverage certification protocol.
	        """,
	    }

	    # Render every document before touching the file system, then write
	    # them in insertion order.
	    rendered = {filename: dedent(raw).strip() for filename, raw in sources.items()}
	    for filename, markdown in rendered.items():
	        write(DOCS_DIR / filename, markdown)


	def main() -> None:
	    """Regenerate the per-task cards, then the release docs, and report completion."""
	    for build_step in (build_task_cards, build_docs):
	        build_step()
	    print("Built SimplexTasks-12 task cards and docs.")


	# Only build when run as a script, so importing this module has no side effects.
	if __name__ == "__main__":
	    main()