File size: 13,945 Bytes

fc329a3

"""Build task cards and benchmark docs for the SimplexTasks-12 release."""

from __future__ import annotations

import json
from pathlib import Path
from textwrap import dedent

import yaml


# Repository root, taken as the parent of the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Root of the SimplexTasks-12 release tree; the per-task directories are
# looked up under its "real" and "synthetic" subdirectories.
RELEASE_ROOT = REPO_ROOT / "release" / "simplextasks-12"
# Destination directory for the release-level Markdown documents.
DOCS_DIR = RELEASE_ROOT / "docs"

# Hand-written card text for the real tasks, keyed by the `task_id` read from
# each task's metadata.json. Every entry supplies the five fields that
# real_task_card() interpolates: evaluation_role, target_definition,
# default_score, default_stratification, and limitations.
REAL_EXTRAS = {
    "cifar10_softmax": {
        "evaluation_role": "Extreme classification-style stress test for allocation failure under fixed class-probability vectors.",
        "target_definition": "A 10-way one-hot class distribution paired with a frozen ResNet-18 softmax cache.",
        "default_score": "Total variation / L1 on the simplex.",
        "default_stratification": "Entropy bins of the softmax prediction.",
        "limitations": "This is a classification proxy rather than a naturally continuous composition task. The release ships derived arrays only and does not mirror raw CIFAR-10 images.",
    },
    "topics_20ng": {
        "evaluation_role": "Smooth-heterogeneity real task for topic-mixture prediction.",
        "target_definition": "Fixed 10-topic compositions constructed once from the 20 Newsgroups corpus, with a TF-IDF-to-topic-mixture kNN regressor as the predictor.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Entropy bins of the predicted topic mixture.",
        "limitations": "Both the target and the predictor output are model-derived topic mixtures. The release exposes derived simplex arrays, not the raw text corpus.",
    },
    "samson_unmixing": {
        "evaluation_role": "Natural low-dimensional compositional benchmark aligned with grouped repair.",
        "target_definition": "Three-endmember abundance compositions paired with a frozen NMF abundance estimator.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the abundance prediction.",
        "limitations": "The benchmark files are public and source-cited, but the upstream bundle did not include an explicit license file; downstream reuse should preserve attribution and the original benchmark context.",
    },
    "pbmc3k_pseudobulk": {
        "evaluation_role": "Semi-synthetic control with known composition targets.",
        "target_definition": "Pseudobulk cell-type fractions generated from PBMC3K together with a fixed NNLS deconvolution predictor.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the predicted cell-type fractions.",
        "limitations": "This is a semi-synthetic control rather than a deployment-ready deconvolution benchmark. The grouped-repair story is sensitive to the pseudobulk concentration setting, which is documented in the paper appendix.",
    },
    "utkface_age_ldl": {
        "evaluation_role": "Image-based label-distribution task with strong grouped heterogeneity.",
        "target_definition": "A 10-bin age label distribution constructed from UTKFace ages and predicted from thumbnail image features using PCA+kNN regression.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Entropy bins of the predicted age distribution.",
        "limitations": "The underlying face images are not redistributed and remain subject to the source dataset's research-use constraints. The benchmark ships derived features and simplex arrays only.",
    },
    "affectivetext_emotions": {
        "evaluation_role": "Small-sample weak-structure counterexample with a frozen language-model predictor.",
        "target_definition": "Normalized six-emotion gold scores from SemEval-2007 AffectiveText paired with a frozen zero-shot scorer cache.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the predicted emotion mixture.",
        "limitations": "The reported benchmark tables use a frozen closed-model cache; the release omits raw headlines and raw API responses. A fully open TF-IDF+SVD+kNN fallback cache-builder is included for local reproduction, but it is not identical to the main predictor used in the paper tables.",
    },
}

# Hand-written card text for the synthetic regimes, keyed by the `task_id`
# read from each task's metadata.json. Each entry supplies the evaluation_role
# and limitations paragraphs that synthetic_task_card() inserts; the other
# card fields are taken from the task's metadata and YAML config.
SYNTH_EXTRAS = {
    "d1_homogeneous": {
        "evaluation_role": "Negative control with no residual-scale heterogeneity.",
        "limitations": "Intended to confirm that the benchmark does not manufacture allocation failures when the DGP is homogeneous.",
    },
    "d2_pure_scale": {
        "evaluation_role": "Canonical smooth-scale heterogeneity regime used to motivate normalization-based repair.",
        "limitations": "This regime isolates scale variation only; it is not intended to cover bias-type misspecification or changing tail shape.",
    },
    "d3_discrete_groups_aligned": {
        "evaluation_role": "Aligned discrete-group regime where group-wise calibration should have an inductive advantage.",
        "limitations": "The grouping is built into the DGP. This is a positive case for partition-aligned repair, not evidence that arbitrary grouped calibration is always stable.",
    },
    "d4_model_bias": {
        "evaluation_role": "Bias-type counterexample where changing the wrapper alone is not enough.",
        "limitations": "This regime is not reducible to a single local scale field, so normalization cannot be expected to fully repair it.",
    },
    "d5_heavy_tail": {
        "evaluation_role": "Smooth-scale regime with heavy tails to stress robustness beyond Gaussian residuals.",
        "limitations": "The regime is still synthetic and targeted; it does not span every possible deviation from the baseline score distribution.",
    },
    "d6_high_k": {
        "evaluation_role": "High-dimensional simplex regime exposing the compute and allocation challenges that appear when K is large.",
        "limitations": "Benchmark-scale exact methods are too expensive here; the paper marks the exact-reference entries separately as D6 dagger cells.",
    },
}


def load_json(path: Path) -> dict:
    """Read and parse the JSON document stored at *path*.

    The file is decoded as UTF-8 explicitly so that parsing does not depend on
    the platform's default locale encoding.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON value; callers in this module index it like a mapping.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def load_yaml(path: Path) -> dict:
    """Read and parse the YAML document stored at *path* with the safe loader.

    The file is decoded as UTF-8 explicitly so that parsing does not depend on
    the platform's default locale encoding.

    Args:
        path: Location of the YAML file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document; callers in this
        module index it like a mapping.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)


def write(path: Path, text: str) -> None:
    """Write *text* to *path* as UTF-8, ending with exactly one newline.

    Missing parent directories are created first. Trailing whitespace in
    *text* is stripped before the final newline is appended. The encoding is
    fixed to UTF-8 so the output does not vary with the platform's default
    locale encoding.

    Args:
        path: Destination file; overwritten if it already exists.
        text: Content to write.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(text.rstrip() + "\n", encoding="utf-8")


def bullet_list(items: list[str]) -> str:
    """Render *items* as Markdown bullets, one ``- item`` line per entry.

    An empty *items* list yields the empty string.
    """
    bullets = [f"- {entry}" for entry in items]
    return "\n".join(bullets)


def synthetic_task_card(task_dir: Path, metadata: dict, config: dict, extra: dict) -> str:
    """Render the Markdown task card for one synthetic regime.

    Args:
        task_dir: Directory of the task; not read while rendering.
        metadata: Task metadata providing the header fields, the
            ``available_arrays`` list, and the provenance fields.
        config: Task configuration with ``evaluation``, ``dgp`` and ``data``
            sections.
        extra: Hand-written ``evaluation_role`` and ``limitations`` text.

    Returns:
        The card as a single string; sections are separated by blank lines and
        there is no trailing newline.
    """
    eval_cfg = config["evaluation"]
    dgp_cfg = config["dgp"]
    data_cfg = config["data"]

    overview = [
        f"# {metadata['task_name']} Task Card",
        "",
        f"- Task ID: `{metadata['task_id']}`",
        "- Subset: synthetic",
        f"- Samples: `{metadata['n_samples']}`",
        f"- Simplex dimension: `{metadata['simplex_dim']}`",
        f"- Predictor: {metadata['predictor']}",
        f"- Regime label: {metadata['regime_label']}",
    ]
    role = ["## Evaluation Role", "", extra["evaluation_role"]]
    dgp_summary = [
        "## DGP Summary",
        "",
        f"- DGP family: `{dgp_cfg['name']}`",
        "- Default score: Aitchison distance",
        f"- Default stratification: `{eval_cfg['strata_method']}` with `{eval_cfg['n_strata']}` strata",
        f"- Calibration size: `{data_cfg['n_cal']}`",
        f"- Test size: `{data_cfg['n_test']}`",
        f"- Repetitions: `{data_cfg['n_rep']}`",
    ]
    contents = [
        "## Release Contents",
        "",
        bullet_list([f"`{name}`" for name in metadata["available_arrays"]]),
    ]
    provenance = [
        "## Provenance And Rebuild",
        "",
        f"- Source asset: {metadata['source_asset']}",
        f"- Config file: `{metadata['config_file']}`",
        f"- Redistribution: `{metadata['redistribution']}`",
        f"- Seed: `{metadata['seed']}`",
    ]
    limitations = ["## Limitations", "", extra["limitations"]]

    # Each section is newline-joined; sections are separated by one blank line.
    sections = (overview, role, dgp_summary, contents, provenance, limitations)
    return "\n\n".join("\n".join(section) for section in sections)


def real_task_card(task_dir: Path, metadata: dict, extra: dict) -> str:
    """Render the Markdown task card for one real (fixed-predictor) task.

    Args:
        task_dir: Directory of the task; not read while rendering.
        metadata: Task metadata providing the header fields, the
            ``available_arrays`` list, ``redistribution``, ``source_asset``
            and ``notes``.
        extra: Hand-written text with ``evaluation_role``,
            ``target_definition``, ``default_score``,
            ``default_stratification`` and ``limitations``.

    Returns:
        The card as a single string; sections are separated by blank lines and
        there is no trailing newline.
    """
    overview = [
        f"# {metadata['task_name']} Task Card",
        "",
        f"- Task ID: `{metadata['task_id']}`",
        "- Subset: real",
        f"- Samples: `{metadata['n_samples']}`",
        f"- Simplex dimension: `{metadata['simplex_dim']}`",
        f"- Predictor: {metadata['predictor']}",
    ]
    role = ["## Evaluation Role", "", extra["evaluation_role"]]
    target = ["## Target And Predictor", "", extra["target_definition"]]
    settings = [
        "## Default Benchmark Settings",
        "",
        f"- Default score: {extra['default_score']}",
        f"- Default stratification: {extra['default_stratification']}",
        f"- Redistribution: `{metadata['redistribution']}`",
    ]
    contents = [
        "## Release Contents",
        "",
        bullet_list([f"`{name}`" for name in metadata["available_arrays"]]),
    ]
    provenance = [
        "## Provenance And Usage Notes",
        "",
        f"- Source asset: {metadata['source_asset']}",
        f"- Metadata note: {metadata['notes']}",
    ]
    limitations = ["## Limitations", "", extra["limitations"]]

    # Each section is newline-joined; sections are separated by one blank line.
    sections = (overview, role, target, settings, contents, provenance, limitations)
    return "\n\n".join("\n".join(section) for section in sections)


def build_task_cards() -> None:
    """Write a ``task_card.md`` into every task directory of the release.

    Task directories are the sub-directories of ``RELEASE_ROOT / "real"`` and
    ``RELEASE_ROOT / "synthetic"``, processed in sorted order. Plain files
    sitting next to them (for example a README or an OS metadata file) are
    skipped, because only directories can hold a ``metadata.json``.

    Raises:
        OSError: If a task directory lacks ``metadata.json`` (or, for
            synthetic tasks, ``config.yaml``).
        KeyError: If a task's ``task_id`` has no entry in ``REAL_EXTRAS`` /
            ``SYNTH_EXTRAS``.
    """
    real_dirs = sorted(p for p in (RELEASE_ROOT / "real").glob("*") if p.is_dir())
    for task_dir in real_dirs:
        metadata = load_json(task_dir / "metadata.json")
        extra = REAL_EXTRAS[metadata["task_id"]]
        write(task_dir / "task_card.md", real_task_card(task_dir, metadata, extra))

    synth_dirs = sorted(p for p in (RELEASE_ROOT / "synthetic").glob("*") if p.is_dir())
    for task_dir in synth_dirs:
        metadata = load_json(task_dir / "metadata.json")
        config = load_yaml(task_dir / "config.yaml")
        extra = SYNTH_EXTRAS[metadata["task_id"]]
        write(task_dir / "task_card.md", synthetic_task_card(task_dir, metadata, config, extra))


def build_docs() -> None:
    """Write the release-level Markdown documents into ``DOCS_DIR``.

    Two files are produced: ``benchmark_card.md`` and
    ``evaluation_protocol.md``. Each body is written indented below for
    readability, then dedented and stripped before being handed to ``write``.
    """
    documents = (
        (
            "benchmark_card.md",
            """
            # SimplexTasks-12 Benchmark Card

            SimplexTasks-12 is the processed task collection behind the SimplexUQ benchmark. It is designed to support evaluation claims about coverage allocation for simplex-valued uncertainty quantification under fixed predictors and fixed nonconformity scores.

            ## Supported Claims

            - Whether a fixed simplex-valued predictor exhibits allocation failure beyond its marginal coverage.
            - Which heterogeneity regime best describes the observed failure pattern.
            - Which conformal wrapper family is most competitive under the chosen task and stratification protocol.

            ## Claims The Benchmark Does Not Support

            - Universal wrapper rankings across all simplex tasks.
            - Deployment-readiness claims for any predictor.
            - Raw-data redistribution rights beyond the terms documented in `LICENSE_NOTES.md`.

            ## Benchmark Contents

            - 6 synthetic regimes with canonical `task.npz` bundles and copied YAML configs.
            - 6 fixed-predictor real tasks with cached `(Y, U)` arrays or derived features.
            - Per-task `task_card.md` files and `metadata.json` provenance records.
            - Release-level rebuild instructions for the paper tables and figures.

            ## Reproducibility Contract

            - Benchmark evaluation always operates on frozen predictor outputs.
            - Default stratification rules are fixed before wrapper comparison.
            - Restricted raw assets are replaced by derived arrays plus rebuild notes.
            - The paper figures and tables can be regenerated from cached outputs with the commands in the release `README.md`.

            ## Responsible Use

            Some tasks rely on non-redistributable source assets or derived features from sensitive domains such as face images. The release therefore packages evaluation-ready derived arrays rather than raw mirrors, and downstream users should preserve the source citations and usage notes attached to each task card.
            """,
        ),
        (
            "evaluation_protocol.md",
            """
            # SimplexTasks-12 Evaluation Protocol

            ## Fixed-Predictor Principle

            Every benchmark slice fixes the predictor first and then varies only the conformal wrapper. This keeps the evaluation target on uncertainty allocation rather than on predictor training.

            ## Default Scores

            - Real and synthetic composition tasks use Aitchison distance unless the target lies on simplex vertices.
            - CIFAR-10 uses total variation / L1 because Aitchison distance is ill-defined at the one-hot boundary.

            ## Stratification Rules

            - Default strata are entropy bins, boundary bins, or task-specific dominant-group partitions.
            - Sensitivity sweeps use fixed alternative stratifications defined from cached prediction vectors only.
            - Stratification maps are not tuned per wrapper and do not depend on calibration/test responses.

            ## Main Metrics

            - Marginal coverage.
            - Max disparity across prediction-space strata.
            - Worst-stratum coverage.
            - Coverage variance.
            - SSCV and mean radius; low-dimensional synthetic tasks also report a simplex-volume ratio.

            ## Wrapper Families

            - Global split conformal.
            - Group-wise / Mondrian conformal.
            - Two-stage normalization.
            - Exact or leave-one-out references where affordable.
            - Diagnostic variants such as OneShot, TrainRes, and the current weighted implementation.

            ## Output Interpretation

            The benchmark is designed to compare wrapper families under visible allocation-efficiency-compute tradeoffs. It should not be reduced to a single leaderboard or read as a conditional-coverage certification protocol.
            """,
        ),
    )

    # Normalise every body first, then write the files in declaration order.
    rendered = [(filename, dedent(raw).strip()) for filename, raw in documents]
    for filename, body in rendered:
        write(DOCS_DIR / filename, body)


def main() -> None:
    """Build the per-task cards, then the release docs, and report completion."""
    for build_step in (build_task_cards, build_docs):
        build_step()
    print("Built SimplexTasks-12 task cards and docs.")


# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()