anonymous0523ly committed on May 3

Commit

fc329a3

verified ·

1 Parent(s): edfea8c

Initial anonymous code release

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

CITATION.cff +14 -0
LICENSE +6 -0
README.md +20 -0
configs/methods/fullcp.yaml +4 -0
configs/methods/global.yaml +4 -0
configs/methods/jackknife_plus.yaml +4 -0
configs/methods/oneshot.yaml +4 -0
configs/methods/partition.yaml +4 -0
configs/methods/trainres.yaml +5 -0
configs/methods/twostage.yaml +4 -0
configs/methods/weighted.yaml +4 -0
configs/real/affectivetext.yaml +6 -0
configs/real/cifar10_softmax.yaml +6 -0
configs/real/pbmc_pseudobulk.yaml +6 -0
configs/real/samson_unmixing.yaml +6 -0
configs/real/topics_20newsgroups.yaml +6 -0
configs/real/utkface_ldl.yaml +6 -0
configs/synthetic/D1.yaml +35 -0
configs/synthetic/D2.yaml +37 -0
configs/synthetic/D3.yaml +36 -0
configs/synthetic/D4.yaml +39 -0
configs/synthetic/D5.yaml +36 -0
configs/synthetic/D6.yaml +33 -0
docs/faq.md +14 -0
docs/release_contract.md +7 -0
docs/restricted_assets.md +12 -0
docs/reviewer_quickstart.md +19 -0
docs/task_limitations.md +7 -0
environment.yml +14 -0
outputs/README.md +1 -0
pyproject.toml +28 -0
rebuild/affectivetext/README.md +3 -0
rebuild/affectivetext/cache_affective_text_open_predictions.py +179 -0
rebuild/affectivetext/rebuild_gold_labels.py +6 -0
rebuild/affectivetext/validate_cache_schema.py +11 -0
rebuild/cifar10/README.md +3 -0
rebuild/cifar10/rebuild_from_torchvision.py +3 -0
rebuild/pbmc/README.md +3 -0
rebuild/pbmc/generate_pseudobulk.py +3 -0
rebuild/pbmc/rebuild_from_pbmc3k.py +3 -0
rebuild/samson/README.md +3 -0
rebuild/samson/rebuild_from_public_bundle.py +3 -0
rebuild/topics/README.md +3 -0
rebuild/topics/rebuild_from_sklearn_fetcher.py +3 -0
rebuild/utkface/README.md +3 -0
rebuild/utkface/rebuild_from_utkface.py +3 -0
requirements.txt +8 -0
scripts/build_simplextasks_docs.py +297 -0
scripts/cache_affective_text_open_predictions.py +179 -0
scripts/cache_affective_text_predictions.py +209 -0

CITATION.cff ADDED Viewed

	@@ -0,0 +1,14 @@

+cff-version: 1.2.0
+title: "SimplexUQ code artifact"
+message: "If you use this benchmark code, please cite the accompanying benchmark paper."
+type: software
+version: 0.1.0
+authors:
+  - family-names: "Authors"
+    given-names: "Anonymous"
+abstract: "Executable code artifact for reproducing the SimplexUQ benchmark figures and tables from frozen derived arrays."
+keywords:
+  - simplex
+  - conformal prediction
+  - benchmark
+license: "other"

LICENSE ADDED Viewed

	@@ -0,0 +1,6 @@

+# Code Artifact License Notice
+This anonymous code bundle is provided for NeurIPS E&D review and benchmark
+reproduction. It does not grant rights to redistribute restricted source
+datasets or raw API outputs. Use the code together with the provenance and
+restricted-asset notes shipped in the paired SimplexTasks-12 data bundle.

README.md ADDED Viewed

	@@ -0,0 +1,20 @@

+# simplexuq-code
+Anonymous code bundle for the SimplexUQ benchmark.
+This repository is meant to be paired with the `SimplexTasks-12-data` dataset
+artifact. It contains:
+- `src/` benchmark logic and utility code
+- `scripts/` benchmark runners and figure/table reproducers
+- `rebuild/` task-specific rebuild notes for restricted assets
+- `configs/` synthetic, real, and method configuration files
+- `docs/` reviewer-facing quickstart and release notes
+Typical usage:
+```bash
+python scripts/check_artifact_integrity.py
+python scripts/reproduce_tables.py
+python scripts/reproduce_figures.py
+```

configs/methods/fullcp.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+method: fullcp
+family: exact / local-scale reference
+paper_label: FullCP
+validity: Exact marginal, expensive in large settings

configs/methods/global.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+method: global
+family: split
+paper_label: Global
+validity: Exact marginal under exchangeability

configs/methods/jackknife_plus.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+method: jackknife_plus
+family: leave-one-out reference
+paper_label: Jackknife+
+validity: Approximate or exact depending on setting; used as a reference in this benchmark

configs/methods/oneshot.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+method: oneshot
+family: diagnostic normalization
+paper_label: OneShot
+validity: No general exchangeability guarantee

configs/methods/partition.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+method: partition
+family: group-wise
+paper_label: Mondrian
+validity: Exact within fixed groups

configs/methods/trainres.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+method: trainres
+family: training-residual normalization
+paper_label: TrainRes
+validity: Can retain marginal validity under strong conditions but may misallocate
+  badly

configs/methods/twostage.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+method: twostage
+family: normalized split
+paper_label: TwoStage
+validity: Exact marginal when the scale fit is independent

configs/methods/weighted.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+method: weighted
+family: weighted conformal diagnostic
+paper_label: Weighted
+validity: Implementation-specific diagnostic only in this benchmark

configs/real/affectivetext.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+task_id: affectivetext_emotions
+default_score: aitchison
+default_stratification: boundary
+alpha: 0.1
+repetitions: 200
+benchmark_mode: fixed_predictor

configs/real/cifar10_softmax.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+task_id: cifar10_softmax
+default_score: tv
+default_stratification: entropy
+alpha: 0.1
+repetitions: 50
+benchmark_mode: fixed_predictor

configs/real/pbmc_pseudobulk.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+task_id: pbmc3k_pseudobulk
+default_score: aitchison
+default_stratification: boundary
+alpha: 0.1
+repetitions: 200
+benchmark_mode: fixed_predictor

configs/real/samson_unmixing.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+task_id: samson_unmixing
+default_score: aitchison
+default_stratification: boundary
+alpha: 0.1
+repetitions: 50
+benchmark_mode: fixed_predictor

configs/real/topics_20newsgroups.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+task_id: topics_20ng
+default_score: aitchison
+default_stratification: entropy
+alpha: 0.1
+repetitions: 50
+benchmark_mode: fixed_predictor

configs/real/utkface_ldl.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+task_id: utkface_age_ldl
+default_score: aitchison
+default_stratification: entropy
+alpha: 0.1
+repetitions: 50
+benchmark_mode: fixed_predictor

configs/synthetic/D1.yaml ADDED Viewed

	@@ -0,0 +1,35 @@

+experiment: d1_homogeneous
+dgp:
+  name: pure_scale
+  K: 3
+  sigma_min: 0.2
+  c: 0.0
+  d_x: 2
+data:
+  n_train: 1000
+  n_cal: 1000
+  n_scale_est: 500
+  n_test: 5000
+  n_rep: 200
+methods:
+  - global
+  - partition
+  - twostage
+  - oneshot
+  - trainres
+  - weighted
+  - oracle
+evaluation:
+  alpha: 0.1
+  strata_method: boundary
+  n_strata: 5
+weighting:
+  mode: inverse_sigma
+  source: knn_loo
+seed: 2026

configs/synthetic/D2.yaml ADDED Viewed

	@@ -0,0 +1,37 @@

+experiment: d2_pure_scale
+dgp:
+  name: pure_scale
+  K: 3
+  sigma_min: 0.1
+  c: 0.5
+  d_x: 2
+data:
+  n_train: 500
+  n_cal: 500
+  n_scale_est: 250
+  n_test: 5000
+  n_rep: 200
+methods:
+  - global
+  - fullcp
+  - jackknife_plus
+  - partition
+  - twostage
+  - oneshot
+  - trainres
+  - weighted
+  - oracle
+evaluation:
+  alpha: 0.1
+  strata_method: boundary
+  n_strata: 5
+weighting:
+  mode: inverse_sigma
+  source: knn_loo
+seed: 2026

configs/synthetic/D3.yaml ADDED Viewed

	@@ -0,0 +1,36 @@

+experiment: d3_discrete_groups_aligned
+dgp:
+  name: discrete_groups
+  K: 10
+  sigma_low: 0.08
+  sigma_high: 0.30
+  d_x: 5
+  easy_classes: 5
+data:
+  n_train: 500
+  n_cal: 500
+  n_scale_est: 250
+  n_test: 5000
+  n_rep: 200
+methods:
+  - global
+  - partition
+  - twostage
+  - fullcp
+  - jackknife_plus
+  - oracle
+evaluation:
+  alpha: 0.1
+  strata_method: argmax_group
+  n_strata: 2
+  split_index: 5
+weighting:
+  mode: inverse_sigma
+  source: knn_loo
+seed: 2026

configs/synthetic/D4.yaml ADDED Viewed

	@@ -0,0 +1,39 @@

+experiment: d4_model_bias
+dgp:
+  name: model_bias
+  K: 3
+  sigma_min: 0.1
+  c: 0.15
+  d_x: 2
+  bias_scale: 0.45
+  bias_type: rotational
+data:
+  n_train: 500
+  n_cal: 500
+  n_scale_est: 250
+  n_test: 5000
+  n_rep: 200
+methods:
+  - global
+  - fullcp
+  - jackknife_plus
+  - partition
+  - twostage
+  - oneshot
+  - trainres
+  - weighted
+  - oracle
+evaluation:
+  alpha: 0.1
+  strata_method: boundary
+  n_strata: 5
+weighting:
+  mode: inverse_sigma
+  source: knn_loo
+seed: 2026

configs/synthetic/D5.yaml ADDED Viewed

	@@ -0,0 +1,36 @@

+experiment: d5_heavy_tail
+dgp:
+  name: heavy_tail
+  K: 3
+  sigma_min: 0.1
+  c: 0.5
+  d_x: 2
+  df: 3.0
+data:
+  n_train: 500
+  n_cal: 500
+  n_scale_est: 250
+  n_test: 5000
+  n_rep: 200
+methods:
+  - global
+  - fullcp
+  - jackknife_plus
+  - partition
+  - twostage
+  - weighted
+  - oracle
+evaluation:
+  alpha: 0.1
+  strata_method: boundary
+  n_strata: 5
+weighting:
+  mode: inverse_sigma
+  source: knn_loo
+seed: 2026

configs/synthetic/D6.yaml ADDED Viewed

	@@ -0,0 +1,33 @@

+experiment: d6_high_k
+dgp:
+  name: high_k
+  K: 50
+  sigma_min: 0.05
+  c: 0.35
+  d_x: 10
+data:
+  n_train: 5000
+  n_cal: 5000
+  n_scale_est: 2500
+  n_test: 5000
+  n_rep: 200
+methods:
+  - global
+  - partition
+  - twostage
+  - weighted
+  - oracle
+evaluation:
+  alpha: 0.1
+  strata_method: entropy
+  n_strata: 5
+weighting:
+  mode: inverse_sigma
+  source: knn_loo
+seed: 2026

docs/faq.md ADDED Viewed

	@@ -0,0 +1,14 @@

+# FAQ
+## Why are there no raw images or raw headlines here?
+Because this artifact is evaluation-first and respects source-asset terms. The
+benchmark runs on frozen derived arrays and rebuild metadata instead of mirroring
+restricted raw assets.
+## Why are there two upload bundles?
+Splitting data and code keeps the dataset artifact clean and reduces ambiguity
+about what counts as the benchmark state versus what counts as execution logic.
+## What should a reviewer run first?
+Start with `python scripts/check_artifact_integrity.py` to verify the two-bundle
+layout, then run the code bundle's figure/table reproduction helpers. They operate
+on frozen derived arrays and are the shortest path to the paper outputs.

docs/release_contract.md ADDED Viewed

	@@ -0,0 +1,7 @@

+# Release Contract
+This code bundle assumes that benchmark evaluation is run on frozen derived arrays.
+It does not require raw-asset mirrors for the paper-level reproduction path.
+The `rebuild/` directories are only for tasks whose source assets are restricted
+or inconvenient to redistribute directly.

docs/restricted_assets.md ADDED Viewed

	@@ -0,0 +1,12 @@

+# Restricted Assets
+The following raw assets are intentionally excluded from the data bundle:
+- CIFAR-10 image archive
+- UTKFace face-image archive
+- Raw AffectiveText headlines
+- Raw AffectiveText API responses
+Rebuild instructions and metadata are provided instead. The benchmark runner is
+designed to consume frozen derived arrays, so raw mirrors are not required for
+the paper-level reproducibility path.

docs/reviewer_quickstart.md ADDED Viewed

	@@ -0,0 +1,19 @@

+# Reviewer Quickstart
+1. Place or symlink the `SimplexTasks-12-data` bundle next to this code bundle.
+2. Create an environment from `environment.yml` or install the packages listed in
+   `requirements.txt`.
+3. Verify the two-bundle layout:
+```bash
+python scripts/check_artifact_integrity.py
+```
+4. Regenerate tables and figures from the frozen cached inputs:
+```bash
+python scripts/reproduce_tables.py
+python scripts/reproduce_figures.py
+```
+5. Inspect `outputs/tables/` and `outputs/figures/`.

docs/task_limitations.md ADDED Viewed

	@@ -0,0 +1,7 @@

+# Task Limitations
+- CIFAR-10 is a classification-style stress test, not a naturally continuous composition task.
+- Topics is model-derived and should be read as topic-mixture evaluation rather than raw-label truth.
+- PBMC is semi-synthetic and is intended as a control-style benchmark slice.
+- UTKFace and AffectiveText rely on derived artifacts because the source assets are restricted.
+- Samson is the cleanest natural low-dimensional composition task in the bundle.

environment.yml ADDED Viewed

	@@ -0,0 +1,14 @@

+name: simplexuq-code
+channels:
+  - conda-forge
+dependencies:
+  - python=3.11
+  - numpy>=1.24
+  - scipy>=1.10
+  - scikit-learn>=1.3
+  - matplotlib>=3.7
+  - pyyaml>=6.0
+  - scanpy
+  - anndata
+  - rpy2
+  - pip

outputs/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ Generated benchmark outputs go under this directory.

pyproject.toml ADDED Viewed

	@@ -0,0 +1,28 @@

+[project]
+name = "simplexuq-code"
+version = "0.1.0"
+requires-python = ">=3.10,<3.14"
+dependencies = [
+    "numpy>=1.24",
+    "scipy>=1.10",
+    "scikit-learn>=1.3",
+    "matplotlib>=3.7",
+    "pyyaml>=6.0",
+]
+[project.optional-dependencies]
+bio = ["scanpy", "anndata", "rpy2"]    # for deconvolution experiments
+r = ["rpy2"]                            # for R integration (visualization, scRNA analysis)
+dev = ["pytest", "ruff", "ipykernel"]
+gpu = ["torch>=2.0", "torchvision>=0.15"]  # for CIFAR softmax experiment
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["src*"]
+[tool.ruff]
+line-length = 100
+# Rule selection belongs in the lint section; the top-level `select` key is deprecated.
+[tool.ruff.lint]
+select = ["E", "F", "I"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]

rebuild/affectivetext/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # affectivetext
2	+
3	+ Rebuild gold labels from the SemEval archive and use the open fallback cache-builder when API access is unavailable.

rebuild/affectivetext/cache_affective_text_open_predictions.py ADDED Viewed

	@@ -0,0 +1,179 @@

+"""Build a fully open AffectiveText prediction cache with out-of-fold regressors."""
+from __future__ import annotations
+import argparse
+import json
+import logging
+from pathlib import Path
+import numpy as np
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import KFold
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.preprocessing import Normalizer
+import sys
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from src.data import EMOTION_NAMES, load_affective_text
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+def macro_pearson(a: np.ndarray, b: np.ndarray) -> float:
+    vals = []
+    for j in range(a.shape[1]):
+        aj = a[:, j]
+        bj = b[:, j]
+        if np.std(aj) <= 1e-12 or np.std(bj) <= 1e-12:
+            continue
+        vals.append(float(np.corrcoef(aj, bj)[0, 1]))
+    return float(np.mean(vals)) if vals else float("nan")
+def fit_predict_fold(
+    train_texts: list[str],
+    test_texts: list[str],
+    train_targets: np.ndarray,
+    n_components: int,
+    n_neighbors: int,
+) -> np.ndarray:
+    """Fit a TF-IDF (+ optional SVD) kNN regressor on one fold and predict the test texts.
+    Args:
+        train_texts: Texts used to fit the vectorizer, the SVD, and the regressor.
+        test_texts: Texts to predict; they are only ever transformed, never fitted on.
+        train_targets: Regression targets aligned with ``train_texts``.
+        n_components: Requested SVD rank (capped by the size of the TF-IDF matrix).
+        n_neighbors: Requested neighbour count (capped by the number of training texts).
+    Returns:
+        The kNN predictions for ``test_texts`` as a float array.
+    """
+    vectorizer = TfidfVectorizer(
+        lowercase=True,
+        strip_accents="unicode",
+        sublinear_tf=True,
+        ngram_range=(1, 2),
+        min_df=1,
+        max_df=0.95,
+        stop_words="english",
+    )
+    # The vocabulary and IDF weights come from the training texts only.
+    x_train = vectorizer.fit_transform(train_texts)
+    x_test = vectorizer.transform(test_texts)
+    # Cap the SVD rank at one less than the smaller dimension of the training matrix.
+    max_rank = min(x_train.shape[0] - 1, x_train.shape[1] - 1)
+    if max_rank >= 2:
+        rank = min(n_components, max_rank)
+        svd = TruncatedSVD(n_components=rank, random_state=0)
+        normalizer = Normalizer(copy=False)
+        x_train = normalizer.fit_transform(svd.fit_transform(x_train))
+        x_test = normalizer.transform(svd.transform(x_test))
+    else:
+        # Too little data for a useful projection: use the dense TF-IDF features directly.
+        x_train = x_train.toarray()
+        x_test = x_test.toarray()
+    knn = KNeighborsRegressor(
+        # Never ask for more neighbours than there are training texts.
+        n_neighbors=min(n_neighbors, len(train_texts)),
+        weights="distance",
+        metric="minkowski",
+        p=2,
+    )
+    knn.fit(x_train, train_targets)
+    return np.asarray(knn.predict(x_test), dtype=float)
+def build_open_predictions(
+    headlines: list[str],
+    raw_scores: np.ndarray,
+    n_splits: int,
+    n_components: int,
+    n_neighbors: int,
+    seed: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    n = len(headlines)
+    preds = np.zeros_like(raw_scores, dtype=float)
+    folds = np.full(n, -1, dtype=int)
+    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
+    global_mean = raw_scores.mean(axis=0)
+    for fold_id, (train_idx, test_idx) in enumerate(splitter.split(headlines)):
+        train_texts = [headlines[i] for i in train_idx]
+        test_texts = [headlines[i] for i in test_idx]
+        train_targets = raw_scores[train_idx]
+        fold_preds = fit_predict_fold(
+            train_texts=train_texts,
+            test_texts=test_texts,
+            train_targets=train_targets,
+            n_components=n_components,
+            n_neighbors=n_neighbors,
+        )
+        fold_preds = np.clip(fold_preds, 0.0, None)
+        zero_rows = fold_preds.sum(axis=1) <= 1e-12
+        if np.any(zero_rows):
+            fold_preds[zero_rows] = global_mean
+        preds[test_idx] = fold_preds
+        folds[test_idx] = fold_id
+        log.info("Finished fold %d/%d", fold_id + 1, n_splits)
+    return preds, folds
+def main() -> None:
+    """Parse CLI options, build the out-of-fold predictions, and write them as JSONL.
+    Raises:
+        FileExistsError: If the output file already exists and ``--overwrite`` is not set.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-dir", default="data/raw/AffectiveText.Semeval.2007")
+    parser.add_argument("--output", default="data/processed/affective_text_open_oof_predictions.jsonl")
+    parser.add_argument("--n-splits", type=int, default=5)
+    parser.add_argument("--n-components", type=int, default=128)
+    parser.add_argument("--n-neighbors", type=int, default=25)
+    parser.add_argument("--seed", type=int, default=2026)
+    parser.add_argument("--limit", type=int, default=None)
+    parser.add_argument("--overwrite", action="store_true")
+    args = parser.parse_args()
+    output_path = Path(args.output)
+    # Refuse to clobber an existing cache before doing any expensive work.
+    if output_path.exists() and not args.overwrite:
+        raise FileExistsError(f"Output already exists: {output_path}")
+    data = load_affective_text(args.data_dir)
+    ids = data["ids"]
+    headlines = data["headlines"]
+    raw_scores = np.asarray(data["raw_scores"], dtype=float)
+    # --limit keeps only the first N rows of each aligned sequence.
+    if args.limit is not None:
+        ids = ids[:args.limit]
+        headlines = headlines[:args.limit]
+        raw_scores = raw_scores[:args.limit]
+    pred_scores, folds = build_open_predictions(
+        headlines=headlines,
+        raw_scores=raw_scores,
+        n_splits=args.n_splits,
+        n_components=args.n_components,
+        n_neighbors=args.n_neighbors,
+        seed=args.seed,
+    )
+    # Two quality summaries: per-column average, and one correlation over all entries.
+    macro_r = macro_pearson(raw_scores, pred_scores)
+    flat_r = float(np.corrcoef(raw_scores.reshape(-1), pred_scores.reshape(-1))[0, 1])
+    log.info(
+        "Open fallback predictor quality: macro Pearson=%.3f, flattened Pearson=%.3f",
+        macro_r,
+        flat_r,
+    )
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    # One JSON object per line; each row also records the builder settings used.
+    with open(output_path, "w", encoding="utf-8") as f:
+        for idx, headline, scores, fold_id in zip(ids, headlines, pred_scores, folds):
+            row = {
+                "id": idx,
+                "headline": headline,
+                "emotions": EMOTION_NAMES,
+                "scores": [float(x) for x in scores],
+                "provider": "open_fallback",
+                "model": "tfidf_svd_knn_oof",
+                "fold": int(fold_id),
+                "builder": {
+                    "n_splits": int(args.n_splits),
+                    "n_components": int(args.n_components),
+                    "n_neighbors": int(args.n_neighbors),
+                    "seed": int(args.seed),
+                },
+                "notes": "Deterministic out-of-fold TF-IDF+SVD+kNN regression fallback.",
+            }
+            f.write(json.dumps(row, ensure_ascii=True) + "\n")
+    log.info("Finished. Predictions cached at %s", output_path)
+if __name__ == "__main__":
+    main()

rebuild/affectivetext/rebuild_gold_labels.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from src.data import load_affective_text
+import sys
+from pathlib import Path
+root = Path(sys.argv[1])
+data = load_affective_text(root)
+print(data['Y'].shape)

rebuild/affectivetext/validate_cache_schema.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from pathlib import Path
+import json
+import sys
+p = Path(sys.argv[1])
+with open(p) as f:
+    for i, line in enumerate(f, 1):
+        row = json.loads(line)
+        for field in ['id', 'scores', 'provider', 'prompt_template']:
+            if field not in row:
+                raise SystemExit(f'missing {field} at line {i}')
+print('cache schema ok')

rebuild/cifar10/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # cifar10
2	+
3	+ Use the frozen CIFAR-10 softmax cache when available. If it is absent, regenerate the softmax predictions locally before exporting Y/U arrays.

rebuild/cifar10/rebuild_from_torchvision.py ADDED Viewed

	@@ -0,0 +1,3 @@

+# Placeholder wrapper generated for the upload bundle.
+# Use the main scripts/ runners in this repository together with the
+# task-specific README in the same directory.

rebuild/pbmc/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # pbmc
2	+
3	+ Rebuild from PBMC3K, generate pseudobulk mixtures, and then freeze the deconvolution outputs.

rebuild/pbmc/generate_pseudobulk.py ADDED Viewed

	@@ -0,0 +1,3 @@

+# Placeholder wrapper generated for the upload bundle.
+# Use the main scripts/ runners in this repository together with the
+# task-specific README in the same directory.

rebuild/pbmc/rebuild_from_pbmc3k.py ADDED Viewed

	@@ -0,0 +1,3 @@

+# Placeholder wrapper generated for the upload bundle.
+# Use the main scripts/ runners in this repository together with the
+# task-specific README in the same directory.

rebuild/samson/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # samson
2	+
3	+ Rebuild from the public Samson benchmark bundle and freeze the NMF abundance outputs before conformal evaluation.

rebuild/samson/rebuild_from_public_bundle.py ADDED Viewed

	@@ -0,0 +1,3 @@

+# Placeholder wrapper generated for the upload bundle.
+# Use the main scripts/ runners in this repository together with the
+# task-specific README in the same directory.

rebuild/topics/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # topics
2	+
3	+ Rebuild the topic-mixture task from the public 20 Newsgroups fetcher, then freeze the derived Y/U arrays before running the benchmark.

rebuild/topics/rebuild_from_sklearn_fetcher.py ADDED Viewed

	@@ -0,0 +1,3 @@

+# Placeholder wrapper generated for the upload bundle.
+# Use the main scripts/ runners in this repository together with the
+# task-specific README in the same directory.

rebuild/utkface/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # utkface
2	+
3	+ Rebuild derived age-distribution features from UTKFace locally; do not mirror the raw face-image archive.

rebuild/utkface/rebuild_from_utkface.py ADDED Viewed

	@@ -0,0 +1,3 @@

+# Placeholder wrapper generated for the upload bundle.
+# Use the main scripts/ runners in this repository together with the
+# task-specific README in the same directory.

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+numpy>=1.24
+scipy>=1.10
+scikit-learn>=1.3
+matplotlib>=3.7
+pyyaml>=6.0
+scanpy
+anndata
+rpy2

scripts/build_simplextasks_docs.py ADDED Viewed

	@@ -0,0 +1,297 @@

+"""Build task cards and benchmark docs for the SimplexTasks-12 release."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from textwrap import dedent
+import yaml
+REPO_ROOT = Path(__file__).resolve().parents[1]
+RELEASE_ROOT = REPO_ROOT / "release" / "simplextasks-12"
+DOCS_DIR = RELEASE_ROOT / "docs"
+REAL_EXTRAS = {
+    "cifar10_softmax": {
+        "evaluation_role": "Extreme classification-style stress test for allocation failure under fixed class-probability vectors.",
+        "target_definition": "A 10-way one-hot class distribution paired with a frozen ResNet-18 softmax cache.",
+        "default_score": "Total variation / L1 on the simplex.",
+        "default_stratification": "Entropy bins of the softmax prediction.",
+        "limitations": "This is a classification proxy rather than a naturally continuous composition task. The release ships derived arrays only and does not mirror raw CIFAR-10 images.",
+    },
+    "topics_20ng": {
+        "evaluation_role": "Smooth-heterogeneity real task for topic-mixture prediction.",
+        "target_definition": "Fixed 10-topic compositions constructed once from the 20 Newsgroups corpus, with a TF-IDF-to-topic-mixture kNN regressor as the predictor.",
+        "default_score": "Aitchison distance.",
+        "default_stratification": "Entropy bins of the predicted topic mixture.",
+        "limitations": "Both the target and the predictor output are model-derived topic mixtures. The release exposes derived simplex arrays, not the raw text corpus.",
+    },
+    "samson_unmixing": {
+        "evaluation_role": "Natural low-dimensional compositional benchmark aligned with grouped repair.",
+        "target_definition": "Three-endmember abundance compositions paired with a frozen NMF abundance estimator.",
+        "default_score": "Aitchison distance.",
+        "default_stratification": "Boundary bins on the abundance prediction.",
+        "limitations": "The benchmark files are public and source-cited, but the upstream bundle did not include an explicit license file; downstream reuse should preserve attribution and the original benchmark context.",
+    },
+    "pbmc3k_pseudobulk": {
+        "evaluation_role": "Semi-synthetic control with known composition targets.",
+        "target_definition": "Pseudobulk cell-type fractions generated from PBMC3K together with a fixed NNLS deconvolution predictor.",
+        "default_score": "Aitchison distance.",
+        "default_stratification": "Boundary bins on the predicted cell-type fractions.",
+        "limitations": "This is a semi-synthetic control rather than a deployment-ready deconvolution benchmark. The grouped-repair story is sensitive to the pseudobulk concentration setting, which is documented in the paper appendix.",
+    },
+    "utkface_age_ldl": {
+        "evaluation_role": "Image-based label-distribution task with strong grouped heterogeneity.",
+        "target_definition": "A 10-bin age label distribution constructed from UTKFace ages and predicted from thumbnail image features using PCA+kNN regression.",
+        "default_score": "Aitchison distance.",
+        "default_stratification": "Entropy bins of the predicted age distribution.",
+        "limitations": "The underlying face images are not redistributed and remain subject to the source dataset's research-use constraints. The benchmark ships derived features and simplex arrays only.",
+    },
+    "affectivetext_emotions": {
+        "evaluation_role": "Small-sample weak-structure counterexample with a frozen language-model predictor.",
+        "target_definition": "Normalized six-emotion gold scores from SemEval-2007 AffectiveText paired with a frozen zero-shot scorer cache.",
+        "default_score": "Aitchison distance.",
+        "default_stratification": "Boundary bins on the predicted emotion mixture.",
+        "limitations": "The reported benchmark tables use a frozen closed-model cache; the release omits raw headlines and raw API responses. A fully open TF-IDF+SVD+kNN fallback cache-builder is included for local reproduction, but it is not identical to the main predictor used in the paper tables.",
+    },
+}
+SYNTH_EXTRAS = {
+    "d1_homogeneous": {
+        "evaluation_role": "Negative control with no residual-scale heterogeneity.",
+        "limitations": "Intended to confirm that the benchmark does not manufacture allocation failures when the DGP is homogeneous.",
+    },
+    "d2_pure_scale": {
+        "evaluation_role": "Canonical smooth-scale heterogeneity regime used to motivate normalization-based repair.",
+        "limitations": "This regime isolates scale variation only; it is not intended to cover bias-type misspecification or changing tail shape.",
+    },
+    "d3_discrete_groups_aligned": {
+        "evaluation_role": "Aligned discrete-group regime where group-wise calibration should have an inductive advantage.",
+        "limitations": "The grouping is built into the DGP. This is a positive case for partition-aligned repair, not evidence that arbitrary grouped calibration is always stable.",
+    },
+    "d4_model_bias": {
+        "evaluation_role": "Bias-type counterexample where changing the wrapper alone is not enough.",
+        "limitations": "This regime is not reducible to a single local scale field, so normalization cannot be expected to fully repair it.",
+    },
+    "d5_heavy_tail": {
+        "evaluation_role": "Smooth-scale regime with heavy tails to stress robustness beyond Gaussian residuals.",
+        "limitations": "The regime is still synthetic and targeted; it does not span every possible deviation from the baseline score distribution.",
+    },
+    "d6_high_k": {
+        "evaluation_role": "High-dimensional simplex regime exposing the compute and allocation challenges that appear when K is large.",
+        "limitations": "Benchmark-scale exact methods are too expensive here; the paper marks the exact-reference entries separately as D6 dagger cells.",
+    },
+}
+def load_json(path: Path) -> dict:
+    with open(path) as f:
+        return json.load(f)
+def load_yaml(path: Path) -> dict:
+    with open(path) as f:
+        return yaml.safe_load(f)
+def write(path: Path, text: str) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(text.rstrip() + "\n")
+def bullet_list(items: list[str]) -> str:
+    return "\n".join(f"- {item}" for item in items)
+def synthetic_task_card(task_dir: Path, metadata: dict, config: dict, extra: dict) -> str:
+    evaluation = config["evaluation"]
+    dgp = config["dgp"]
+    data = config["data"]
+    lines = [
+        f"# {metadata['task_name']} Task Card",
+        "",
+        f"- Task ID: `{metadata['task_id']}`",
+        "- Subset: synthetic",
+        f"- Samples: `{metadata['n_samples']}`",
+        f"- Simplex dimension: `{metadata['simplex_dim']}`",
+        f"- Predictor: {metadata['predictor']}",
+        f"- Regime label: {metadata['regime_label']}",
+        "",
+        "## Evaluation Role",
+        "",
+        extra["evaluation_role"],
+        "",
+        "## DGP Summary",
+        "",
+        f"- DGP family: `{dgp['name']}`",
+        "- Default score: Aitchison distance",
+        f"- Default stratification: `{evaluation['strata_method']}` with `{evaluation['n_strata']}` strata",
+        f"- Calibration size: `{data['n_cal']}`",
+        f"- Test size: `{data['n_test']}`",
+        f"- Repetitions: `{data['n_rep']}`",
+        "",
+        "## Release Contents",
+        "",
+        bullet_list([f"`{name}`" for name in metadata["available_arrays"]]),
+        "",
+        "## Provenance And Rebuild",
+        "",
+        f"- Source asset: {metadata['source_asset']}",
+        f"- Config file: `{metadata['config_file']}`",
+        f"- Redistribution: `{metadata['redistribution']}`",
+        f"- Seed: `{metadata['seed']}`",
+        "",
+        "## Limitations",
+        "",
+        extra["limitations"],
+    ]
+    return "\n".join(lines)
+def real_task_card(task_dir: Path, metadata: dict, extra: dict) -> str:
+    lines = [
+        f"# {metadata['task_name']} Task Card",
+        "",
+        f"- Task ID: `{metadata['task_id']}`",
+        "- Subset: real",
+        f"- Samples: `{metadata['n_samples']}`",
+        f"- Simplex dimension: `{metadata['simplex_dim']}`",
+        f"- Predictor: {metadata['predictor']}",
+        "",
+        "## Evaluation Role",
+        "",
+        extra["evaluation_role"],
+        "",
+        "## Target And Predictor",
+        "",
+        extra["target_definition"],
+        "",
+        "## Default Benchmark Settings",
+        "",
+        f"- Default score: {extra['default_score']}",
+        f"- Default stratification: {extra['default_stratification']}",
+        f"- Redistribution: `{metadata['redistribution']}`",
+        "",
+        "## Release Contents",
+        "",
+        bullet_list([f"`{name}`" for name in metadata["available_arrays"]]),
+        "",
+        "## Provenance And Usage Notes",
+        "",
+        f"- Source asset: {metadata['source_asset']}",
+        f"- Metadata note: {metadata['notes']}",
+        "",
+        "## Limitations",
+        "",
+        extra["limitations"],
+    ]
+    return "\n".join(lines)
+def build_task_cards() -> None:
+    for task_dir in sorted((RELEASE_ROOT / "real").glob("*")):
+        metadata = load_json(task_dir / "metadata.json")
+        extra = REAL_EXTRAS[metadata["task_id"]]
+        write(task_dir / "task_card.md", real_task_card(task_dir, metadata, extra))
+    for task_dir in sorted((RELEASE_ROOT / "synthetic").glob("*")):
+        metadata = load_json(task_dir / "metadata.json")
+        config = load_yaml(task_dir / "config.yaml")
+        extra = SYNTH_EXTRAS[metadata["task_id"]]
+        write(task_dir / "task_card.md", synthetic_task_card(task_dir, metadata, config, extra))
+def build_docs() -> None:
+    """Write the release-level benchmark card and evaluation protocol into ``DOCS_DIR``."""
+    # Both documents are static Markdown; dedent + strip removes the source
+    # indentation and the leading/trailing newlines of the literals.
+    benchmark_card = dedent(
+        """
+        # SimplexTasks-12 Benchmark Card
+        SimplexTasks-12 is the processed task collection behind the SimplexUQ benchmark. It is designed to support evaluation claims about coverage allocation for simplex-valued uncertainty quantification under fixed predictors and fixed nonconformity scores.
+        ## Supported Claims
+        - Whether a fixed simplex-valued predictor exhibits allocation failure beyond its marginal coverage.
+        - Which heterogeneity regime best describes the observed failure pattern.
+        - Which conformal wrapper family is most competitive under the chosen task and stratification protocol.
+        ## Claims The Benchmark Does Not Support
+        - Universal wrapper rankings across all simplex tasks.
+        - Deployment-readiness claims for any predictor.
+        - Raw-data redistribution rights beyond the terms documented in `LICENSE_NOTES.md`.
+        ## Benchmark Contents
+        - 6 synthetic regimes with canonical `task.npz` bundles and copied YAML configs.
+        - 6 fixed-predictor real tasks with cached `(Y, U)` arrays or derived features.
+        - Per-task `task_card.md` files and `metadata.json` provenance records.
+        - Release-level rebuild instructions for the paper tables and figures.
+        ## Reproducibility Contract
+        - Benchmark evaluation always operates on frozen predictor outputs.
+        - Default stratification rules are fixed before wrapper comparison.
+        - Restricted raw assets are replaced by derived arrays plus rebuild notes.
+        - The paper figures and tables can be regenerated from cached outputs with the commands in the release `README.md`.
+        ## Responsible Use
+        Some tasks rely on non-redistributable source assets or derived features from sensitive domains such as face images. The release therefore packages evaluation-ready derived arrays rather than raw mirrors, and downstream users should preserve the source citations and usage notes attached to each task card.
+        """
+    ).strip()
+    evaluation_protocol = dedent(
+        """
+        # SimplexTasks-12 Evaluation Protocol
+        ## Fixed-Predictor Principle
+        Every benchmark slice fixes the predictor first and then varies only the conformal wrapper. This keeps the evaluation target on uncertainty allocation rather than on predictor training.
+        ## Default Scores
+        - Real and synthetic composition tasks use Aitchison distance unless the target lies on simplex vertices.
+        - CIFAR-10 uses total variation / L1 because Aitchison distance is ill-defined at the one-hot boundary.
+        ## Stratification Rules
+        - Default strata are entropy bins, boundary bins, or task-specific dominant-group partitions.
+        - Sensitivity sweeps use fixed alternative stratifications defined from cached prediction vectors only.
+        - Stratification maps are not tuned per wrapper and do not depend on calibration/test responses.
+        ## Main Metrics
+        - Marginal coverage.
+        - Max disparity across prediction-space strata.
+        - Worst-stratum coverage.
+        - Coverage variance.
+        - SSCV and mean radius; low-dimensional synthetic tasks also report a simplex-volume ratio.
+        ## Wrapper Families
+        - Global split conformal.
+        - Group-wise / Mondrian conformal.
+        - Two-stage normalization.
+        - Exact or leave-one-out references where affordable.
+        - Diagnostic variants such as OneShot, TrainRes, and the current weighted implementation.
+        ## Output Interpretation
+        The benchmark is designed to compare wrapper families under visible allocation-efficiency-compute tradeoffs. It should not be reduced to a single leaderboard or read as a conditional-coverage certification protocol.
+        """
+    ).strip()
+    # Persist both documents next to each other in the docs directory.
+    write(DOCS_DIR / "benchmark_card.md", benchmark_card)
+    write(DOCS_DIR / "evaluation_protocol.md", evaluation_protocol)
+def main() -> None:
+    build_task_cards()
+    build_docs()
+    print("Built SimplexTasks-12 task cards and docs.")
+if __name__ == "__main__":
+    main()

scripts/cache_affective_text_open_predictions.py ADDED Viewed

	@@ -0,0 +1,179 @@

+"""Build a fully open AffectiveText prediction cache with out-of-fold regressors."""
+from __future__ import annotations
+import argparse
+import json
+import logging
+from pathlib import Path
+import numpy as np
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import KFold
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.preprocessing import Normalizer
+import sys
+# Put the directory one level above this script's folder first on sys.path so
+# the `src` package import below resolves when the script is run directly.
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from src.data import EMOTION_NAMES, load_affective_text
+# Timestamped INFO-level logging for the whole script.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+def macro_pearson(a: np.ndarray, b: np.ndarray) -> float:
+    vals = []
+    for j in range(a.shape[1]):
+        aj = a[:, j]
+        bj = b[:, j]
+        if np.std(aj) <= 1e-12 or np.std(bj) <= 1e-12:
+            continue
+        vals.append(float(np.corrcoef(aj, bj)[0, 1]))
+    return float(np.mean(vals)) if vals else float("nan")
+def fit_predict_fold(
+    train_texts: list[str],
+    test_texts: list[str],
+    train_targets: np.ndarray,
+    n_components: int,
+    n_neighbors: int,
+) -> np.ndarray:
+    vectorizer = TfidfVectorizer(
+        lowercase=True,
+        strip_accents="unicode",
+        sublinear_tf=True,
+        ngram_range=(1, 2),
+        min_df=1,
+        max_df=0.95,
+        stop_words="english",
+    )
+    x_train = vectorizer.fit_transform(train_texts)
+    x_test = vectorizer.transform(test_texts)
+    max_rank = min(x_train.shape[0] - 1, x_train.shape[1] - 1)
+    if max_rank >= 2:
+        rank = min(n_components, max_rank)
+        svd = TruncatedSVD(n_components=rank, random_state=0)
+        normalizer = Normalizer(copy=False)
+        x_train = normalizer.fit_transform(svd.fit_transform(x_train))
+        x_test = normalizer.transform(svd.transform(x_test))
+    else:
+        x_train = x_train.toarray()
+        x_test = x_test.toarray()
+    knn = KNeighborsRegressor(
+        n_neighbors=min(n_neighbors, len(train_texts)),
+        weights="distance",
+        metric="minkowski",
+        p=2,
+    )
+    knn.fit(x_train, train_targets)
+    return np.asarray(knn.predict(x_test), dtype=float)
+def build_open_predictions(
+    headlines: list[str],
+    raw_scores: np.ndarray,
+    n_splits: int,
+    n_components: int,
+    n_neighbors: int,
+    seed: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    n = len(headlines)
+    preds = np.zeros_like(raw_scores, dtype=float)
+    folds = np.full(n, -1, dtype=int)
+    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
+    global_mean = raw_scores.mean(axis=0)
+    for fold_id, (train_idx, test_idx) in enumerate(splitter.split(headlines)):
+        train_texts = [headlines[i] for i in train_idx]
+        test_texts = [headlines[i] for i in test_idx]
+        train_targets = raw_scores[train_idx]
+        fold_preds = fit_predict_fold(
+            train_texts=train_texts,
+            test_texts=test_texts,
+            train_targets=train_targets,
+            n_components=n_components,
+            n_neighbors=n_neighbors,
+        )
+        fold_preds = np.clip(fold_preds, 0.0, None)
+        zero_rows = fold_preds.sum(axis=1) <= 1e-12
+        if np.any(zero_rows):
+            fold_preds[zero_rows] = global_mean
+        preds[test_idx] = fold_preds
+        folds[test_idx] = fold_id
+        log.info("Finished fold %d/%d", fold_id + 1, n_splits)
+    return preds, folds
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-dir", default="data/raw/AffectiveText.Semeval.2007")
+    parser.add_argument("--output", default="data/processed/affective_text_open_oof_predictions.jsonl")
+    parser.add_argument("--n-splits", type=int, default=5)
+    parser.add_argument("--n-components", type=int, default=128)
+    parser.add_argument("--n-neighbors", type=int, default=25)
+    parser.add_argument("--seed", type=int, default=2026)
+    parser.add_argument("--limit", type=int, default=None)
+    parser.add_argument("--overwrite", action="store_true")
+    args = parser.parse_args()
+    output_path = Path(args.output)
+    if output_path.exists() and not args.overwrite:
+        raise FileExistsError(f"Output already exists: {output_path}")
+    data = load_affective_text(args.data_dir)
+    ids = data["ids"]
+    headlines = data["headlines"]
+    raw_scores = np.asarray(data["raw_scores"], dtype=float)
+    if args.limit is not None:
+        ids = ids[:args.limit]
+        headlines = headlines[:args.limit]
+        raw_scores = raw_scores[:args.limit]
+    pred_scores, folds = build_open_predictions(
+        headlines=headlines,
+        raw_scores=raw_scores,
+        n_splits=args.n_splits,
+        n_components=args.n_components,
+        n_neighbors=args.n_neighbors,
+        seed=args.seed,
+    )
+    macro_r = macro_pearson(raw_scores, pred_scores)
+    flat_r = float(np.corrcoef(raw_scores.reshape(-1), pred_scores.reshape(-1))[0, 1])
+    log.info(
+        "Open fallback predictor quality: macro Pearson=%.3f, flattened Pearson=%.3f",
+        macro_r,
+        flat_r,
+    )
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        for idx, headline, scores, fold_id in zip(ids, headlines, pred_scores, folds):
+            row = {
+                "id": idx,
+                "headline": headline,
+                "emotions": EMOTION_NAMES,
+                "scores": [float(x) for x in scores],
+                "provider": "open_fallback",
+                "model": "tfidf_svd_knn_oof",
+                "fold": int(fold_id),
+                "builder": {
+                    "n_splits": int(args.n_splits),
+                    "n_components": int(args.n_components),
+                    "n_neighbors": int(args.n_neighbors),
+                    "seed": int(args.seed),
+                },
+                "notes": "Deterministic out-of-fold TF-IDF+SVD+kNN regression fallback.",
+            }
+            f.write(json.dumps(row, ensure_ascii=True) + "\n")
+    log.info("Finished. Predictions cached at %s", output_path)
+if __name__ == "__main__":
+    main()

scripts/cache_affective_text_predictions.py ADDED Viewed

	@@ -0,0 +1,209 @@

+"""Cache zero-shot API emotion scores for SemEval-2007 Affective Text."""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import os
+import re
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+from pathlib import Path
+import sys
+# Put the directory one level above this script's folder first on sys.path so
+# the `src` package import below resolves when the script is run directly.
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from src.data import EMOTION_NAMES, load_affective_text, load_prediction_cache
+# Timestamped INFO-level logging for the whole script.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+# Prompt sent for every headline; `{headline}` is filled in with str.format.
+# The emotion order requested here is the order in which parse_scores reads
+# the six numbers back, and the template is stored verbatim in each cache row.
+PROMPT_TEMPLATE = (
+    'Rate the following news headline on 6 emotions: anger, disgust, fear, joy, sadness, surprise. '
+    'Return only 6 numbers from 0 to 100, comma-separated, in that order.\n'
+    'Headline: "{headline}"\n'
+    "Scores:"
+)
+def parse_scores(text: str) -> list[float]:
+    nums = re.findall(r"-?\d+(?:\.\d+)?", text)
+    if len(nums) < 6:
+        raise ValueError(f"Could not parse 6 scores from response: {text!r}")
+    scores = [max(float(x), 0.0) for x in nums[:6]]
+    if sum(scores) <= 0:
+        raise ValueError(f"Parsed zero-sum scores from response: {text!r}")
+    return scores
+def call_openai_chat_completions(
+    headline: str,
+    model: str,
+    api_key: str,
+    base_url: str,
+    timeout_sec: float,
+) -> tuple[str, dict]:
+    prompt = PROMPT_TEMPLATE.format(headline=headline)
+    payload = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": "You are a precise annotation model."},
+            {"role": "user", "content": prompt},
+        ],
+        "temperature": 0,
+    }
+    req = urllib.request.Request(
+        url=base_url.rstrip("/") + "/chat/completions",
+        data=json.dumps(payload).encode("utf-8"),
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {api_key}",
+        },
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=timeout_sec) as resp:
+        body = json.loads(resp.read().decode("utf-8"))
+    text = body["choices"][0]["message"]["content"]
+    return text, body
+def call_gemini_generate_content(
+    headline: str,
+    model: str,
+    api_key: str,
+    base_url: str,
+    timeout_sec: float,
+) -> tuple[str, dict]:
+    prompt = PROMPT_TEMPLATE.format(headline=headline)
+    payload = {
+        "contents": [
+            {
+                "role": "user",
+                "parts": [{"text": prompt}],
+            }
+        ],
+        "generationConfig": {
+            "temperature": 0,
+        },
+    }
+    url = (
+        base_url.rstrip("/")
+        + f"/models/{model}:generateContent?key={urllib.parse.quote(api_key)}"
+    )
+    req = urllib.request.Request(
+        url=url,
+        data=json.dumps(payload).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=timeout_sec) as resp:
+        body = json.loads(resp.read().decode("utf-8"))
+    candidates = body.get("candidates", [])
+    if not candidates:
+        raise KeyError(f"No Gemini candidates in response: {body}")
+    parts = candidates[0].get("content", {}).get("parts", [])
+    text = "\n".join(part.get("text", "") for part in parts if part.get("text"))
+    if not text:
+        raise KeyError(f"No text parts in Gemini response: {body}")
+    return text, body
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-dir", default="data/raw/AffectiveText.Semeval.2007")
+    parser.add_argument("--output", default="data/processed/affective_text_predictions.jsonl")
+    parser.add_argument("--provider", choices=["openai", "gemini"], default="gemini")
+    parser.add_argument("--model", default=None)
+    parser.add_argument("--base-url", default=None)
+    parser.add_argument("--api-key-env", default=None)
+    parser.add_argument("--limit", type=int, default=None)
+    parser.add_argument("--sleep-sec", type=float, default=0.0)
+    parser.add_argument("--timeout-sec", type=float, default=60.0)
+    parser.add_argument("--overwrite", action="store_true")
+    args = parser.parse_args()
+    if args.model is None:
+        if args.provider == "gemini":
+            args.model = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001")
+        else:
+            args.model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini-2024-07-18")
+    if args.base_url is None:
+        if args.provider == "gemini":
+            args.base_url = os.environ.get("GEMINI_BASE_URL", "https://generativelanguage.googleapis.com/v1beta")
+        else:
+            args.base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
+    if args.api_key_env is None:
+        args.api_key_env = "GEMINI_API_KEY" if args.provider == "gemini" else "OPENAI_API_KEY"
+    api_key = os.environ.get(args.api_key_env)
+    if not api_key:
+        raise EnvironmentError(f"Missing API key in env var {args.api_key_env}")
+    data = load_affective_text(args.data_dir)
+    ids = data["ids"]
+    headlines = data["headlines"]
+    if args.limit is not None:
+        ids = ids[:args.limit]
+        headlines = headlines[:args.limit]
+    out_path = Path(args.output)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    existing = {}
+    if out_path.exists() and not args.overwrite:
+        existing = load_prediction_cache(out_path)
+        log.info(f"Loaded {len(existing)} cached predictions from {out_path}")
+    n_done = 0
+    with open(out_path, "a" if existing and not args.overwrite else "w", encoding="utf-8") as f:
+        for idx, headline in zip(ids, headlines):
+            if idx in existing and not args.overwrite:
+                continue
+            try:
+                if args.provider == "gemini":
+                    raw_text, raw_json = call_gemini_generate_content(
+                        headline=headline,
+                        model=args.model,
+                        api_key=api_key,
+                        base_url=args.base_url,
+                        timeout_sec=args.timeout_sec,
+                    )
+                else:
+                    raw_text, raw_json = call_openai_chat_completions(
+                        headline=headline,
+                        model=args.model,
+                        api_key=api_key,
+                        base_url=args.base_url,
+                        timeout_sec=args.timeout_sec,
+                    )
+                scores = parse_scores(raw_text)
+            except (urllib.error.URLError, urllib.error.HTTPError, ValueError, KeyError) as exc:
+                log.error(f"Failed on id={idx}: {exc}")
+                continue
+            row = {
+                "id": idx,
+                "headline": headline,
+                "emotions": EMOTION_NAMES,
+                "scores": scores,
+                "provider": args.provider,
+                "model": args.model,
+                "base_url": args.base_url,
+                "prompt_template": PROMPT_TEMPLATE,
+                "raw_text": raw_text,
+                "raw_response": raw_json,
+            }
+            f.write(json.dumps(row, ensure_ascii=True) + "\n")
+            f.flush()
+            n_done += 1
+            if n_done % 50 == 0:
+                log.info(f"Cached {n_done} new predictions")
+            if args.sleep_sec > 0:
+                time.sleep(args.sleep_sec)
+    log.info(f"Finished. Predictions cached at {out_path}")
+if __name__ == "__main__":
+    main()