anonymous0523ly committed on
Commit
fc329a3
·
verified ·
1 Parent(s): edfea8c

Initial anonymous code release

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. CITATION.cff +14 -0
  2. LICENSE +6 -0
  3. README.md +20 -0
  4. configs/methods/fullcp.yaml +4 -0
  5. configs/methods/global.yaml +4 -0
  6. configs/methods/jackknife_plus.yaml +4 -0
  7. configs/methods/oneshot.yaml +4 -0
  8. configs/methods/partition.yaml +4 -0
  9. configs/methods/trainres.yaml +5 -0
  10. configs/methods/twostage.yaml +4 -0
  11. configs/methods/weighted.yaml +4 -0
  12. configs/real/affectivetext.yaml +6 -0
  13. configs/real/cifar10_softmax.yaml +6 -0
  14. configs/real/pbmc_pseudobulk.yaml +6 -0
  15. configs/real/samson_unmixing.yaml +6 -0
  16. configs/real/topics_20newsgroups.yaml +6 -0
  17. configs/real/utkface_ldl.yaml +6 -0
  18. configs/synthetic/D1.yaml +35 -0
  19. configs/synthetic/D2.yaml +37 -0
  20. configs/synthetic/D3.yaml +36 -0
  21. configs/synthetic/D4.yaml +39 -0
  22. configs/synthetic/D5.yaml +36 -0
  23. configs/synthetic/D6.yaml +33 -0
  24. docs/faq.md +14 -0
  25. docs/release_contract.md +7 -0
  26. docs/restricted_assets.md +12 -0
  27. docs/reviewer_quickstart.md +19 -0
  28. docs/task_limitations.md +7 -0
  29. environment.yml +14 -0
  30. outputs/README.md +1 -0
  31. pyproject.toml +28 -0
  32. rebuild/affectivetext/README.md +3 -0
  33. rebuild/affectivetext/cache_affective_text_open_predictions.py +179 -0
  34. rebuild/affectivetext/rebuild_gold_labels.py +6 -0
  35. rebuild/affectivetext/validate_cache_schema.py +11 -0
  36. rebuild/cifar10/README.md +3 -0
  37. rebuild/cifar10/rebuild_from_torchvision.py +3 -0
  38. rebuild/pbmc/README.md +3 -0
  39. rebuild/pbmc/generate_pseudobulk.py +3 -0
  40. rebuild/pbmc/rebuild_from_pbmc3k.py +3 -0
  41. rebuild/samson/README.md +3 -0
  42. rebuild/samson/rebuild_from_public_bundle.py +3 -0
  43. rebuild/topics/README.md +3 -0
  44. rebuild/topics/rebuild_from_sklearn_fetcher.py +3 -0
  45. rebuild/utkface/README.md +3 -0
  46. rebuild/utkface/rebuild_from_utkface.py +3 -0
  47. requirements.txt +8 -0
  48. scripts/build_simplextasks_docs.py +297 -0
  49. scripts/cache_affective_text_open_predictions.py +179 -0
  50. scripts/cache_affective_text_predictions.py +209 -0
CITATION.cff ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cff-version: 1.2.0
2
+ title: "SimplexUQ code artifact"
3
+ message: "If you use this benchmark code, please cite the accompanying benchmark paper."
4
+ type: software
5
+ version: 0.1.0
6
+ authors:
7
+ - family-names: "Authors"
8
+ given-names: "Anonymous"
9
+ abstract: "Executable code artifact for reproducing the SimplexUQ benchmark figures and tables from frozen derived arrays."
10
+ keywords:
11
+ - simplex
12
+ - conformal prediction
13
+ - benchmark
14
+ license: "other"
LICENSE ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Code Artifact License Notice
2
+
3
+ This anonymous code bundle is provided for NeurIPS E&D review and benchmark
4
+ reproduction. It does not grant rights to redistribute restricted source
5
+ datasets or raw API outputs. Use the code together with the provenance and
6
+ restricted-asset notes shipped in the paired SimplexTasks-12 data bundle.
README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # simplexuq-code
2
+
3
+ Anonymous code bundle for the SimplexUQ benchmark.
4
+
5
+ This repository is meant to be paired with the `SimplexTasks-12-data` dataset
6
+ artifact. It contains:
7
+
8
+ - `src/` benchmark logic and utility code
9
+ - `scripts/` benchmark runners and figure/table reproducers
10
+ - `rebuild/` task-specific rebuild notes for restricted assets
11
+ - `configs/` synthetic, real, and method configuration files
12
+ - `docs/` reviewer-facing quickstart and release notes
13
+
14
+ Typical usage:
15
+
16
+ ```bash
17
+ python scripts/check_artifact_integrity.py
18
+ python scripts/reproduce_tables.py
19
+ python scripts/reproduce_figures.py
20
+ ```
configs/methods/fullcp.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ method: fullcp
2
+ family: exact / local-scale reference
3
+ paper_label: FullCP
4
+ validity: Exact marginal, expensive in large settings
configs/methods/global.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ method: global
2
+ family: split
3
+ paper_label: Global
4
+ validity: Exact marginal under exchangeability
configs/methods/jackknife_plus.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ method: jackknife_plus
2
+ family: leave-one-out reference
3
+ paper_label: Jackknife+
4
+ validity: Approximate or exact depending on setting; used as a reference in this benchmark
configs/methods/oneshot.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ method: oneshot
2
+ family: diagnostic normalization
3
+ paper_label: OneShot
4
+ validity: No general exchangeability guarantee
configs/methods/partition.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ method: partition
2
+ family: group-wise
3
+ paper_label: Mondrian
4
+ validity: Exact within fixed groups
configs/methods/trainres.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ method: trainres
2
+ family: training-residual normalization
3
+ paper_label: TrainRes
4
+ validity: Can retain marginal validity under strong conditions but may misallocate
5
+ badly
configs/methods/twostage.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ method: twostage
2
+ family: normalized split
3
+ paper_label: TwoStage
4
+ validity: Exact marginal when the scale fit is independent
configs/methods/weighted.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ method: weighted
2
+ family: weighted conformal diagnostic
3
+ paper_label: Weighted
4
+ validity: Implementation-specific diagnostic only in this benchmark
configs/real/affectivetext.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ task_id: affectivetext_emotions
2
+ default_score: aitchison
3
+ default_stratification: boundary
4
+ alpha: 0.1
5
+ repetitions: 200
6
+ benchmark_mode: fixed_predictor
configs/real/cifar10_softmax.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ task_id: cifar10_softmax
2
+ default_score: tv
3
+ default_stratification: entropy
4
+ alpha: 0.1
5
+ repetitions: 50
6
+ benchmark_mode: fixed_predictor
configs/real/pbmc_pseudobulk.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ task_id: pbmc3k_pseudobulk
2
+ default_score: aitchison
3
+ default_stratification: boundary
4
+ alpha: 0.1
5
+ repetitions: 200
6
+ benchmark_mode: fixed_predictor
configs/real/samson_unmixing.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ task_id: samson_unmixing
2
+ default_score: aitchison
3
+ default_stratification: boundary
4
+ alpha: 0.1
5
+ repetitions: 50
6
+ benchmark_mode: fixed_predictor
configs/real/topics_20newsgroups.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ task_id: topics_20ng
2
+ default_score: aitchison
3
+ default_stratification: entropy
4
+ alpha: 0.1
5
+ repetitions: 50
6
+ benchmark_mode: fixed_predictor
configs/real/utkface_ldl.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ task_id: utkface_age_ldl
2
+ default_score: aitchison
3
+ default_stratification: entropy
4
+ alpha: 0.1
5
+ repetitions: 50
6
+ benchmark_mode: fixed_predictor
configs/synthetic/D1.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment: d1_homogeneous
2
+
3
+ dgp:
4
+ name: pure_scale
5
+ K: 3
6
+ sigma_min: 0.2
7
+ c: 0.0
8
+ d_x: 2
9
+
10
+ data:
11
+ n_train: 1000
12
+ n_cal: 1000
13
+ n_scale_est: 500
14
+ n_test: 5000
15
+ n_rep: 200
16
+
17
+ methods:
18
+ - global
19
+ - partition
20
+ - twostage
21
+ - oneshot
22
+ - trainres
23
+ - weighted
24
+ - oracle
25
+
26
+ evaluation:
27
+ alpha: 0.1
28
+ strata_method: boundary
29
+ n_strata: 5
30
+
31
+ weighting:
32
+ mode: inverse_sigma
33
+ source: knn_loo
34
+
35
+ seed: 2026
configs/synthetic/D2.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment: d2_pure_scale
2
+
3
+ dgp:
4
+ name: pure_scale
5
+ K: 3
6
+ sigma_min: 0.1
7
+ c: 0.5
8
+ d_x: 2
9
+
10
+ data:
11
+ n_train: 500
12
+ n_cal: 500
13
+ n_scale_est: 250
14
+ n_test: 5000
15
+ n_rep: 200
16
+
17
+ methods:
18
+ - global
19
+ - fullcp
20
+ - jackknife_plus
21
+ - partition
22
+ - twostage
23
+ - oneshot
24
+ - trainres
25
+ - weighted
26
+ - oracle
27
+
28
+ evaluation:
29
+ alpha: 0.1
30
+ strata_method: boundary
31
+ n_strata: 5
32
+
33
+ weighting:
34
+ mode: inverse_sigma
35
+ source: knn_loo
36
+
37
+ seed: 2026
configs/synthetic/D3.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment: d3_discrete_groups_aligned
2
+
3
+ dgp:
4
+ name: discrete_groups
5
+ K: 10
6
+ sigma_low: 0.08
7
+ sigma_high: 0.30
8
+ d_x: 5
9
+ easy_classes: 5
10
+
11
+ data:
12
+ n_train: 500
13
+ n_cal: 500
14
+ n_scale_est: 250
15
+ n_test: 5000
16
+ n_rep: 200
17
+
18
+ methods:
19
+ - global
20
+ - partition
21
+ - twostage
22
+ - fullcp
23
+ - jackknife_plus
24
+ - oracle
25
+
26
+ evaluation:
27
+ alpha: 0.1
28
+ strata_method: argmax_group
29
+ n_strata: 2
30
+ split_index: 5
31
+
32
+ weighting:
33
+ mode: inverse_sigma
34
+ source: knn_loo
35
+
36
+ seed: 2026
configs/synthetic/D4.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment: d4_model_bias
2
+
3
+ dgp:
4
+ name: model_bias
5
+ K: 3
6
+ sigma_min: 0.1
7
+ c: 0.15
8
+ d_x: 2
9
+ bias_scale: 0.45
10
+ bias_type: rotational
11
+
12
+ data:
13
+ n_train: 500
14
+ n_cal: 500
15
+ n_scale_est: 250
16
+ n_test: 5000
17
+ n_rep: 200
18
+
19
+ methods:
20
+ - global
21
+ - fullcp
22
+ - jackknife_plus
23
+ - partition
24
+ - twostage
25
+ - oneshot
26
+ - trainres
27
+ - weighted
28
+ - oracle
29
+
30
+ evaluation:
31
+ alpha: 0.1
32
+ strata_method: boundary
33
+ n_strata: 5
34
+
35
+ weighting:
36
+ mode: inverse_sigma
37
+ source: knn_loo
38
+
39
+ seed: 2026
configs/synthetic/D5.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment: d5_heavy_tail
2
+
3
+ dgp:
4
+ name: heavy_tail
5
+ K: 3
6
+ sigma_min: 0.1
7
+ c: 0.5
8
+ d_x: 2
9
+ df: 3.0
10
+
11
+ data:
12
+ n_train: 500
13
+ n_cal: 500
14
+ n_scale_est: 250
15
+ n_test: 5000
16
+ n_rep: 200
17
+
18
+ methods:
19
+ - global
20
+ - fullcp
21
+ - jackknife_plus
22
+ - partition
23
+ - twostage
24
+ - weighted
25
+ - oracle
26
+
27
+ evaluation:
28
+ alpha: 0.1
29
+ strata_method: boundary
30
+ n_strata: 5
31
+
32
+ weighting:
33
+ mode: inverse_sigma
34
+ source: knn_loo
35
+
36
+ seed: 2026
configs/synthetic/D6.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment: d6_high_k
2
+
3
+ dgp:
4
+ name: high_k
5
+ K: 50
6
+ sigma_min: 0.05
7
+ c: 0.35
8
+ d_x: 10
9
+
10
+ data:
11
+ n_train: 5000
12
+ n_cal: 5000
13
+ n_scale_est: 2500
14
+ n_test: 5000
15
+ n_rep: 200
16
+
17
+ methods:
18
+ - global
19
+ - partition
20
+ - twostage
21
+ - weighted
22
+ - oracle
23
+
24
+ evaluation:
25
+ alpha: 0.1
26
+ strata_method: entropy
27
+ n_strata: 5
28
+
29
+ weighting:
30
+ mode: inverse_sigma
31
+ source: knn_loo
32
+
33
+ seed: 2026
docs/faq.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FAQ
2
+
3
+ ## Why are there no raw images or raw headlines here?
4
+ Because this artifact is evaluation-first and respects source-asset terms. The
5
+ benchmark runs on frozen derived arrays and rebuild metadata instead of mirroring
6
+ restricted raw assets.
7
+
8
+ ## Why are there two upload bundles?
9
+ Splitting data and code keeps the dataset artifact clean and reduces ambiguity
10
+ about what counts as the benchmark state versus what counts as execution logic.
11
+
12
+ ## What should a reviewer run first?
13
+ The code bundle's figure/table reproduction helpers. They operate on frozen
14
+ derived arrays and are the shortest path to the paper outputs.
docs/release_contract.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Release Contract
2
+
3
+ This code bundle assumes that benchmark evaluation is run on frozen derived arrays.
4
+ It does not require raw-asset mirrors for the paper-level reproduction path.
5
+
6
+ The `rebuild/` directories are only for tasks whose source assets are restricted
7
+ or inconvenient to redistribute directly.
docs/restricted_assets.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Restricted Assets
2
+
3
+ The following raw assets are intentionally excluded from the data bundle:
4
+
5
+ - CIFAR-10 image archive
6
+ - UTKFace face-image archive
7
+ - Raw AffectiveText headlines
8
+ - Raw AffectiveText API responses
9
+
10
+ Rebuild instructions and metadata are provided instead. The benchmark runner is
11
+ designed to consume frozen derived arrays, so raw mirrors are not required for
12
+ the paper-level reproducibility path.
docs/reviewer_quickstart.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reviewer Quickstart
2
+
3
+ 1. Place or symlink the `SimplexTasks-12-data` bundle next to this code bundle.
4
+ 2. Create an environment from `environment.yml` or install the packages listed in
5
+ `requirements.txt`.
6
+ 3. Verify the two-bundle layout:
7
+
8
+ ```bash
9
+ python scripts/check_artifact_integrity.py
10
+ ```
11
+
12
+ 4. Regenerate tables and figures from the frozen cached inputs:
13
+
14
+ ```bash
15
+ python scripts/reproduce_tables.py
16
+ python scripts/reproduce_figures.py
17
+ ```
18
+
19
+ 5. Inspect `outputs/tables/` and `outputs/figures/`.
docs/task_limitations.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Task Limitations
2
+
3
+ - CIFAR-10 is a classification-style stress test, not a naturally continuous composition task.
4
+ - Topics is model-derived and should be read as topic-mixture evaluation rather than raw-label truth.
5
+ - PBMC is semi-synthetic and is intended as a control-style benchmark slice.
6
+ - UTKFace and AffectiveText rely on derived artifacts because the source assets are restricted.
7
+ - Samson is the cleanest natural low-dimensional composition task in the bundle.
environment.yml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: simplexuq-code
2
+ channels:
3
+ - conda-forge
4
+ dependencies:
5
+ - python=3.11
6
+ - numpy>=1.24
7
+ - scipy>=1.10
8
+ - scikit-learn>=1.3
9
+ - matplotlib>=3.7
10
+ - pyyaml>=6.0
11
+ - scanpy
12
+ - anndata
13
+ - rpy2
14
+ - pip
outputs/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ Generated benchmark outputs go under this directory.
pyproject.toml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "simplexuq-code"
3
+ version = "0.1.0"
4
+ requires-python = ">=3.10,<3.14"
5
+ dependencies = [
6
+ "numpy>=1.24",
7
+ "scipy>=1.10",
8
+ "scikit-learn>=1.3",
9
+ "matplotlib>=3.7",
10
+ "pyyaml>=6.0",
11
+ ]
12
+
13
+ [project.optional-dependencies]
14
+ bio = ["scanpy", "anndata", "rpy2"] # for deconvolution experiments
15
+ r = ["rpy2"] # for R integration (visualization, scRNA analysis)
16
+ dev = ["pytest", "ruff", "ipykernel"]
17
+ gpu = ["torch>=2.0", "torchvision>=0.15"] # for CIFAR softmax experiment
18
+
19
+ [tool.setuptools.packages.find]
20
+ where = ["."]
21
+ include = ["src*"]
22
+
23
+ [tool.ruff]
24
+ line-length = 100
25
+ select = ["E", "F", "I"]
26
+
27
+ [tool.pytest.ini_options]
28
+ testpaths = ["tests"]
rebuild/affectivetext/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # affectivetext
2
+
3
+ Rebuild gold labels from the SemEval archive and use the open fallback cache-builder when API access is unavailable.
rebuild/affectivetext/cache_affective_text_open_predictions.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build a fully open AffectiveText prediction cache with out-of-fold regressors."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import logging
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ from sklearn.decomposition import TruncatedSVD
11
+ from sklearn.feature_extraction.text import TfidfVectorizer
12
+ from sklearn.model_selection import KFold
13
+ from sklearn.neighbors import KNeighborsRegressor
14
+ from sklearn.preprocessing import Normalizer
15
+
16
import sys

# Make the repository root importable so ``from src.data import ...`` resolves.
# A fixed ``parent.parent`` is only correct when this file sits exactly one
# directory below the repository root (e.g. ``scripts/``); from
# ``rebuild/affectivetext/`` it points at ``rebuild/`` instead.  Walk upward to
# the nearest ancestor that actually contains ``src/`` and fall back to the
# original two-levels-up guess when no such ancestor exists.
_SCRIPT_PATH = Path(__file__).resolve()
REPO_ROOT = next(
    (parent for parent in _SCRIPT_PATH.parents if (parent / "src").is_dir()),
    _SCRIPT_PATH.parent.parent,
)
sys.path.insert(0, str(REPO_ROOT))
18
+
19
+ from src.data import EMOTION_NAMES, load_affective_text
20
+
21
# Configure logging once at import time: INFO level with timestamped records.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
# Module-level logger used for fold progress and the final summary messages.
log = logging.getLogger(__name__)
23
+
24
+
25
def macro_pearson(a: np.ndarray, b: np.ndarray) -> float:
    """Return the mean column-wise Pearson correlation between ``a`` and ``b``.

    Columns are paired by index over ``a.shape[1]``.  A column pair is left out
    of the average when either side has a standard deviation of at most 1e-12
    (the correlation is undefined there).  If no column pair remains, the
    result is ``nan``.
    """
    correlations = []
    column = 0
    while column < a.shape[1]:
        left, right = a[:, column], b[:, column]
        column += 1
        degenerate = np.std(left) <= 1e-12 or np.std(right) <= 1e-12
        if not degenerate:
            correlations.append(float(np.corrcoef(left, right)[0, 1]))
    if not correlations:
        return float("nan")
    return float(np.mean(correlations))
34
+
35
+
36
def fit_predict_fold(
    train_texts: list[str],
    test_texts: list[str],
    train_targets: np.ndarray,
    n_components: int,
    n_neighbors: int,
) -> np.ndarray:
    """Fit the text regressor on one training fold and predict the held-out texts.

    Pipeline: TF-IDF features (unigrams and bigrams, English stop words) fitted
    on ``train_texts`` only, then a truncated SVD followed by sklearn's
    ``Normalizer``, then a distance-weighted kNN regressor.  When the training
    matrix is too small for an SVD of rank >= 2 the dense TF-IDF matrix is fed
    to the regressor directly.

    Args:
        train_texts: Texts used to fit the vectorizer, SVD and regressor.
        test_texts: Texts to predict for.
        train_targets: Regression targets aligned with ``train_texts``.
        n_components: Requested SVD rank (capped by the training matrix size).
        n_neighbors: Requested neighbour count (capped by ``len(train_texts)``).

    Returns:
        Float array of predictions, one row per entry of ``test_texts``.
    """
    tfidf = TfidfVectorizer(
        lowercase=True,
        strip_accents="unicode",
        sublinear_tf=True,
        ngram_range=(1, 2),
        min_df=1,
        max_df=0.95,
        stop_words="english",
    )
    train_matrix = tfidf.fit_transform(train_texts)
    test_matrix = tfidf.transform(test_texts)

    n_rows, n_terms = train_matrix.shape[0], train_matrix.shape[1]
    rank_cap = min(n_rows, n_terms) - 1
    if rank_cap < 2:
        # Not enough rows/terms for a meaningful SVD: use raw TF-IDF, densified.
        train_features = train_matrix.toarray()
        test_features = test_matrix.toarray()
    else:
        reducer = TruncatedSVD(n_components=min(n_components, rank_cap), random_state=0)
        row_scaler = Normalizer(copy=False)
        train_features = row_scaler.fit_transform(reducer.fit_transform(train_matrix))
        test_features = row_scaler.transform(reducer.transform(test_matrix))

    # Never ask for more neighbours than there are training rows.
    neighbour_count = min(n_neighbors, len(train_texts))
    regressor = KNeighborsRegressor(
        n_neighbors=neighbour_count,
        weights="distance",
        metric="minkowski",
        p=2,
    )
    regressor.fit(train_features, train_targets)
    return np.asarray(regressor.predict(test_features), dtype=float)
74
+
75
+
76
def build_open_predictions(
    headlines: list[str],
    raw_scores: np.ndarray,
    n_splits: int,
    n_components: int,
    n_neighbors: int,
    seed: int,
) -> tuple[np.ndarray, np.ndarray]:
    """Build out-of-fold predictions for every headline with shuffled K-fold CV.

    Each row is predicted by a model fitted on the other folds only.  Negative
    predictions are clipped to zero.  Rows whose clipped prediction sums to
    (numerically) zero are replaced by the mean target of *that fold's
    training rows*, so the fallback never uses labels from the held-out fold
    and the predictions stay strictly out-of-fold.

    Args:
        headlines: Input texts, aligned with the rows of ``raw_scores``.
        raw_scores: Target score matrix, one row per headline.
        n_splits: Number of cross-validation folds.
        n_components: SVD rank forwarded to ``fit_predict_fold``.
        n_neighbors: Neighbour count forwarded to ``fit_predict_fold``.
        seed: Random state for the fold shuffling.

    Returns:
        ``(preds, folds)``: predictions with the shape of ``raw_scores`` and,
        for each row, the id of the fold in which it was held out.
    """
    n = len(headlines)
    preds = np.zeros_like(raw_scores, dtype=float)
    folds = np.full(n, -1, dtype=int)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold_id, (train_idx, test_idx) in enumerate(splitter.split(headlines)):
        train_texts = [headlines[i] for i in train_idx]
        test_texts = [headlines[i] for i in test_idx]
        train_targets = raw_scores[train_idx]
        fold_preds = fit_predict_fold(
            train_texts=train_texts,
            test_texts=test_texts,
            train_targets=train_targets,
            n_components=n_components,
            n_neighbors=n_neighbors,
        )
        fold_preds = np.clip(fold_preds, 0.0, None)
        zero_rows = fold_preds.sum(axis=1) <= 1e-12
        if np.any(zero_rows):
            # Use the training-fold mean, not the mean over all rows: the
            # latter would include the held-out labels being predicted.
            fold_preds[zero_rows] = train_targets.mean(axis=0)
        preds[test_idx] = fold_preds
        folds[test_idx] = fold_id
        log.info("Finished fold %d/%d", fold_id + 1, n_splits)

    return preds, folds
110
+
111
+
112
def main() -> None:
    """Command-line entry point: build the open-fallback cache and write it as JSONL.

    Loads the AffectiveText data, optionally truncates it to ``--limit`` rows,
    computes out-of-fold predictions, logs two Pearson summaries against the
    raw scores, and writes one JSON object per headline to ``--output``.

    Raises:
        FileExistsError: If the output file exists and ``--overwrite`` is not set.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--data-dir", default="data/raw/AffectiveText.Semeval.2007")
    cli.add_argument("--output", default="data/processed/affective_text_open_oof_predictions.jsonl")
    cli.add_argument("--n-splits", type=int, default=5)
    cli.add_argument("--n-components", type=int, default=128)
    cli.add_argument("--n-neighbors", type=int, default=25)
    cli.add_argument("--seed", type=int, default=2026)
    cli.add_argument("--limit", type=int, default=None)
    cli.add_argument("--overwrite", action="store_true")
    args = cli.parse_args()

    # Refuse to clobber an existing cache unless explicitly asked to.
    output_path = Path(args.output)
    if not args.overwrite and output_path.exists():
        raise FileExistsError(f"Output already exists: {output_path}")

    data = load_affective_text(args.data_dir)
    ids, headlines = data["ids"], data["headlines"]
    raw_scores = np.asarray(data["raw_scores"], dtype=float)
    if args.limit is not None:
        head = slice(None, args.limit)
        ids, headlines, raw_scores = ids[head], headlines[head], raw_scores[head]

    pred_scores, folds = build_open_predictions(
        headlines=headlines,
        raw_scores=raw_scores,
        n_splits=args.n_splits,
        n_components=args.n_components,
        n_neighbors=args.n_neighbors,
        seed=args.seed,
    )

    # Quality summary: per-column average and a single correlation over all cells.
    macro_r = macro_pearson(raw_scores, pred_scores)
    flat_r = float(np.corrcoef(raw_scores.reshape(-1), pred_scores.reshape(-1))[0, 1])
    log.info(
        "Open fallback predictor quality: macro Pearson=%.3f, flattened Pearson=%.3f",
        macro_r,
        flat_r,
    )

    # The builder settings are identical for every row, so assemble them once.
    builder_info = {
        "n_splits": int(args.n_splits),
        "n_components": int(args.n_components),
        "n_neighbors": int(args.n_neighbors),
        "seed": int(args.seed),
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as sink:
        for row_id, headline, scores, fold_id in zip(ids, headlines, pred_scores, folds):
            record = {
                "id": row_id,
                "headline": headline,
                "emotions": EMOTION_NAMES,
                "scores": list(map(float, scores)),
                "provider": "open_fallback",
                "model": "tfidf_svd_knn_oof",
                "fold": int(fold_id),
                "builder": builder_info,
                "notes": "Deterministic out-of-fold TF-IDF+SVD+kNN regression fallback.",
            }
            sink.write(json.dumps(record, ensure_ascii=True) + "\n")

    log.info("Finished. Predictions cached at %s", output_path)


if __name__ == "__main__":
    main()
rebuild/affectivetext/rebuild_gold_labels.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
"""Load the AffectiveText gold labels and print the shape of the ``Y`` array.

Usage:
    python rebuild_gold_labels.py <affectivetext-data-dir>
"""
import sys
from pathlib import Path


def main() -> None:
    """Load the dataset from the directory given on the command line.

    Raises:
        SystemExit: With a usage message when no directory argument is given.
    """
    if len(sys.argv) < 2:
        raise SystemExit("usage: rebuild_gold_labels.py <affectivetext-data-dir>")

    # ``src`` is a directory of the repository, not of this script's folder, so
    # running the script directly would not find it.  Put the nearest ancestor
    # that contains ``src/`` on the import path before importing from it.
    script_path = Path(__file__).resolve()
    repo_root = next(
        (parent for parent in script_path.parents if (parent / "src").is_dir()),
        None,
    )
    if repo_root is not None and str(repo_root) not in sys.path:
        sys.path.insert(0, str(repo_root))

    from src.data import load_affective_text

    root = Path(sys.argv[1])
    data = load_affective_text(root)
    # NOTE(review): ``Y`` is presumably the gold-label matrix — confirm in src.data.
    print(data['Y'].shape)


if __name__ == "__main__":
    main()
rebuild/affectivetext/validate_cache_schema.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Validate the JSONL schema of an AffectiveText prediction cache.

Usage:
    python validate_cache_schema.py <cache.jsonl>
"""
from pathlib import Path
import json
import sys

# Fields every cache row must carry.
REQUIRED_FIELDS = ("id", "scores", "provider")
# Additional fields required from prompt-based caches.
PROMPT_FIELDS = ("prompt_template",)
# Provider tag of rows produced by the open TF-IDF+SVD+kNN fallback builder.
# That builder uses no prompt and writes no ``prompt_template`` field, so such
# rows are exempt from PROMPT_FIELDS.
OPEN_FALLBACK_PROVIDER = "open_fallback"


def required_fields(row) -> tuple:
    """Return the field names that ``row`` must contain."""
    if isinstance(row, dict) and row.get("provider") == OPEN_FALLBACK_PROVIDER:
        return REQUIRED_FIELDS
    return REQUIRED_FIELDS + PROMPT_FIELDS


def validate_cache(path) -> int:
    """Check every non-blank line of the JSONL file at ``path``.

    Args:
        path: Location of the cache file (UTF-8 encoded JSON lines).

    Returns:
        The number of rows that were validated.

    Raises:
        SystemExit: With ``missing <field> at line <n>`` for the first row that
            lacks a required field (``n`` is the 1-based physical line number).
    """
    checked = 0
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            row = json.loads(line)
            for field in required_fields(row):
                if field not in row:
                    raise SystemExit(f'missing {field} at line {i}')
            checked += 1
    return checked


def main() -> None:
    """Validate the cache file named on the command line and report success."""
    if len(sys.argv) < 2:
        raise SystemExit("usage: validate_cache_schema.py <cache.jsonl>")
    validate_cache(Path(sys.argv[1]))
    print('cache schema ok')


if __name__ == "__main__":
    main()
rebuild/cifar10/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # cifar10
2
+
3
+ Use the frozen CIFAR-10 softmax cache when available. If it is absent, regenerate the softmax predictions locally before exporting Y/U arrays.
rebuild/cifar10/rebuild_from_torchvision.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Placeholder wrapper generated for the upload bundle.
2
+ # Use the main scripts/ runners in this repository together with the
3
+ # task-specific README in the same directory.
rebuild/pbmc/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # pbmc
2
+
3
+ Rebuild from PBMC3K, generate pseudobulk mixtures, and then freeze the deconvolution outputs.
rebuild/pbmc/generate_pseudobulk.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Placeholder wrapper generated for the upload bundle.
2
+ # Use the main scripts/ runners in this repository together with the
3
+ # task-specific README in the same directory.
rebuild/pbmc/rebuild_from_pbmc3k.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Placeholder wrapper generated for the upload bundle.
2
+ # Use the main scripts/ runners in this repository together with the
3
+ # task-specific README in the same directory.
rebuild/samson/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # samson
2
+
3
+ Rebuild from the public Samson benchmark bundle and freeze the NMF abundance outputs before conformal evaluation.
rebuild/samson/rebuild_from_public_bundle.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Placeholder wrapper generated for the upload bundle.
2
+ # Use the main scripts/ runners in this repository together with the
3
+ # task-specific README in the same directory.
rebuild/topics/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # topics
2
+
3
+ Rebuild the topic-mixture task from the public 20 Newsgroups fetcher, then freeze the derived Y/U arrays before running the benchmark.
rebuild/topics/rebuild_from_sklearn_fetcher.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Placeholder wrapper generated for the upload bundle.
2
+ # Use the main scripts/ runners in this repository together with the
3
+ # task-specific README in the same directory.
rebuild/utkface/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # utkface
2
+
3
+ Rebuild derived age-distribution features from UTKFace locally; do not mirror the raw face-image archive.
rebuild/utkface/rebuild_from_utkface.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Placeholder wrapper generated for the upload bundle.
2
+ # Use the main scripts/ runners in this repository together with the
3
+ # task-specific README in the same directory.
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ numpy>=1.24
2
+ scipy>=1.10
3
+ scikit-learn>=1.3
4
+ matplotlib>=3.7
5
+ pyyaml>=6.0
6
+ scanpy
7
+ anndata
8
+ rpy2
scripts/build_simplextasks_docs.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build task cards and benchmark docs for the SimplexTasks-12 release."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from textwrap import dedent
8
+
9
+ import yaml
10
+
11
+
12
# Directory one level above the folder that contains this script.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Root of the SimplexTasks-12 release tree inside the repository.
RELEASE_ROOT = REPO_ROOT / "release" / "simplextasks-12"
# Documentation directory of the release tree.
DOCS_DIR = RELEASE_ROOT / "docs"
15
+
16
# Hand-written task-card text for the real-data tasks, keyed by task id.
# Each entry provides the same five string fields: evaluation_role,
# target_definition, default_score, default_stratification and limitations.
REAL_EXTRAS = {
    "cifar10_softmax": {
        "evaluation_role": "Extreme classification-style stress test for allocation failure under fixed class-probability vectors.",
        "target_definition": "A 10-way one-hot class distribution paired with a frozen ResNet-18 softmax cache.",
        "default_score": "Total variation / L1 on the simplex.",
        "default_stratification": "Entropy bins of the softmax prediction.",
        "limitations": "This is a classification proxy rather than a naturally continuous composition task. The release ships derived arrays only and does not mirror raw CIFAR-10 images.",
    },
    "topics_20ng": {
        "evaluation_role": "Smooth-heterogeneity real task for topic-mixture prediction.",
        "target_definition": "Fixed 10-topic compositions constructed once from the 20 Newsgroups corpus, with a TF-IDF-to-topic-mixture kNN regressor as the predictor.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Entropy bins of the predicted topic mixture.",
        "limitations": "Both the target and the predictor output are model-derived topic mixtures. The release exposes derived simplex arrays, not the raw text corpus.",
    },
    "samson_unmixing": {
        "evaluation_role": "Natural low-dimensional compositional benchmark aligned with grouped repair.",
        "target_definition": "Three-endmember abundance compositions paired with a frozen NMF abundance estimator.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the abundance prediction.",
        "limitations": "The benchmark files are public and source-cited, but the upstream bundle did not include an explicit license file; downstream reuse should preserve attribution and the original benchmark context.",
    },
    "pbmc3k_pseudobulk": {
        "evaluation_role": "Semi-synthetic control with known composition targets.",
        "target_definition": "Pseudobulk cell-type fractions generated from PBMC3K together with a fixed NNLS deconvolution predictor.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the predicted cell-type fractions.",
        "limitations": "This is a semi-synthetic control rather than a deployment-ready deconvolution benchmark. The grouped-repair story is sensitive to the pseudobulk concentration setting, which is documented in the paper appendix.",
    },
    "utkface_age_ldl": {
        "evaluation_role": "Image-based label-distribution task with strong grouped heterogeneity.",
        "target_definition": "A 10-bin age label distribution constructed from UTKFace ages and predicted from thumbnail image features using PCA+kNN regression.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Entropy bins of the predicted age distribution.",
        "limitations": "The underlying face images are not redistributed and remain subject to the source dataset's research-use constraints. The benchmark ships derived features and simplex arrays only.",
    },
    "affectivetext_emotions": {
        "evaluation_role": "Small-sample weak-structure counterexample with a frozen language-model predictor.",
        "target_definition": "Normalized six-emotion gold scores from SemEval-2007 AffectiveText paired with a frozen zero-shot scorer cache.",
        "default_score": "Aitchison distance.",
        "default_stratification": "Boundary bins on the predicted emotion mixture.",
        "limitations": "The reported benchmark tables use a frozen closed-model cache; the release omits raw headlines and raw API responses. A fully open TF-IDF+SVD+kNN fallback cache-builder is included for local reproduction, but it is not identical to the main predictor used in the paper tables.",
    },
}
60
+
61
# Hand-written task-card text for the synthetic regimes, keyed by experiment
# name (d1 ... d6).  Each entry provides two string fields: evaluation_role and
# limitations.
SYNTH_EXTRAS = {
    "d1_homogeneous": {
        "evaluation_role": "Negative control with no residual-scale heterogeneity.",
        "limitations": "Intended to confirm that the benchmark does not manufacture allocation failures when the DGP is homogeneous.",
    },
    "d2_pure_scale": {
        "evaluation_role": "Canonical smooth-scale heterogeneity regime used to motivate normalization-based repair.",
        "limitations": "This regime isolates scale variation only; it is not intended to cover bias-type misspecification or changing tail shape.",
    },
    "d3_discrete_groups_aligned": {
        "evaluation_role": "Aligned discrete-group regime where group-wise calibration should have an inductive advantage.",
        "limitations": "The grouping is built into the DGP. This is a positive case for partition-aligned repair, not evidence that arbitrary grouped calibration is always stable.",
    },
    "d4_model_bias": {
        "evaluation_role": "Bias-type counterexample where changing the wrapper alone is not enough.",
        "limitations": "This regime is not reducible to a single local scale field, so normalization cannot be expected to fully repair it.",
    },
    "d5_heavy_tail": {
        "evaluation_role": "Smooth-scale regime with heavy tails to stress robustness beyond Gaussian residuals.",
        "limitations": "The regime is still synthetic and targeted; it does not span every possible deviation from the baseline score distribution.",
    },
    "d6_high_k": {
        "evaluation_role": "High-dimensional simplex regime exposing the compute and allocation challenges that appear when K is large.",
        "limitations": "Benchmark-scale exact methods are too expensive here; the paper marks the exact-reference entries separately as D6 dagger cells.",
    },
}
87
+
88
+
89
def load_json(path: Path) -> dict:
    """Parse the JSON file at *path* and return its contents.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's locale encoding.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
92
+
93
+
94
def load_yaml(path: Path) -> dict:
    """Parse the YAML file at *path* with ``yaml.safe_load`` and return the result.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's locale encoding.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)
97
+
98
+
99
def write(path: Path, text: str) -> None:
    """Write *text* to *path* as UTF-8, ending in exactly one newline.

    Missing parent directories are created first. Trailing whitespace in
    *text* is stripped before the final newline is appended. The encoding is
    fixed to UTF-8 so the generated files are byte-identical across platforms.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(text.rstrip() + "\n", encoding="utf-8")
102
+
103
+
104
def bullet_list(items: list[str]) -> str:
    """Format *items* as Markdown bullets, one ``- item`` line per entry.

    An empty input yields an empty string.
    """
    bullets = map("- {}".format, items)
    return "\n".join(bullets)
106
+
107
+
108
def synthetic_task_card(task_dir: Path, metadata: dict, config: dict, extra: dict) -> str:
    """Render the Markdown task card for one synthetic task.

    Args:
        task_dir: Task directory; not used by this function.
        metadata: Task metadata; the keys read are visible in the body below.
        config: Task config with ``evaluation``, ``dgp`` and ``data`` mappings.
        extra: Narrative text with ``evaluation_role`` and ``limitations``.

    Returns:
        The card as one newline-joined string (no trailing newline).
    """
    eval_cfg = config["evaluation"]
    dgp_cfg = config["dgp"]
    data_cfg = config["data"]

    # (heading, body lines) pairs; the first pair is the untitled header block.
    sections = [
        (
            None,
            [
                "# {} Task Card".format(metadata["task_name"]),
                "",
                "- Task ID: `{}`".format(metadata["task_id"]),
                "- Subset: synthetic",
                "- Samples: `{}`".format(metadata["n_samples"]),
                "- Simplex dimension: `{}`".format(metadata["simplex_dim"]),
                "- Predictor: {}".format(metadata["predictor"]),
                "- Regime label: {}".format(metadata["regime_label"]),
            ],
        ),
        ("Evaluation Role", [extra["evaluation_role"]]),
        (
            "DGP Summary",
            [
                "- DGP family: `{}`".format(dgp_cfg["name"]),
                "- Default score: Aitchison distance",
                "- Default stratification: `{}` with `{}` strata".format(
                    eval_cfg["strata_method"], eval_cfg["n_strata"]
                ),
                "- Calibration size: `{}`".format(data_cfg["n_cal"]),
                "- Test size: `{}`".format(data_cfg["n_test"]),
                "- Repetitions: `{}`".format(data_cfg["n_rep"]),
            ],
        ),
        (
            "Release Contents",
            [bullet_list(["`{}`".format(name) for name in metadata["available_arrays"]])],
        ),
        (
            "Provenance And Rebuild",
            [
                "- Source asset: {}".format(metadata["source_asset"]),
                "- Config file: `{}`".format(metadata["config_file"]),
                "- Redistribution: `{}`".format(metadata["redistribution"]),
                "- Seed: `{}`".format(metadata["seed"]),
            ],
        ),
        ("Limitations", [extra["limitations"]]),
    ]

    card_lines = []
    for heading, body in sections:
        if heading is not None:
            # Each titled section is set off by a blank line on either side.
            card_lines += ["", "## " + heading, ""]
        card_lines += body
    return "\n".join(card_lines)
151
+
152
+
153
def real_task_card(task_dir: Path, metadata: dict, extra: dict) -> str:
    """Render the Markdown task card for one real-data task.

    Args:
        task_dir: Task directory; not used by this function.
        metadata: Task metadata; the keys read are visible in the body below.
        extra: Narrative text with ``evaluation_role``, ``target_definition``,
            ``default_score``, ``default_stratification`` and ``limitations``.

    Returns:
        The card as one newline-joined string (no trailing newline).
    """
    # (heading, body lines) pairs; the first pair is the untitled header block.
    sections = [
        (
            None,
            [
                "# {} Task Card".format(metadata["task_name"]),
                "",
                "- Task ID: `{}`".format(metadata["task_id"]),
                "- Subset: real",
                "- Samples: `{}`".format(metadata["n_samples"]),
                "- Simplex dimension: `{}`".format(metadata["simplex_dim"]),
                "- Predictor: {}".format(metadata["predictor"]),
            ],
        ),
        ("Evaluation Role", [extra["evaluation_role"]]),
        ("Target And Predictor", [extra["target_definition"]]),
        (
            "Default Benchmark Settings",
            [
                "- Default score: {}".format(extra["default_score"]),
                "- Default stratification: {}".format(extra["default_stratification"]),
                "- Redistribution: `{}`".format(metadata["redistribution"]),
            ],
        ),
        (
            "Release Contents",
            [bullet_list(["`{}`".format(name) for name in metadata["available_arrays"]])],
        ),
        (
            "Provenance And Usage Notes",
            [
                "- Source asset: {}".format(metadata["source_asset"]),
                "- Metadata note: {}".format(metadata["notes"]),
            ],
        ),
        ("Limitations", [extra["limitations"]]),
    ]

    card_lines = []
    for heading, body in sections:
        if heading is not None:
            # Each titled section is set off by a blank line on either side.
            card_lines += ["", "## " + heading, ""]
        card_lines += body
    return "\n".join(card_lines)
191
+
192
+
193
+ def build_task_cards() -> None:
194
+ for task_dir in sorted((RELEASE_ROOT / "real").glob("*")):
195
+ metadata = load_json(task_dir / "metadata.json")
196
+ extra = REAL_EXTRAS[metadata["task_id"]]
197
+ write(task_dir / "task_card.md", real_task_card(task_dir, metadata, extra))
198
+
199
+ for task_dir in sorted((RELEASE_ROOT / "synthetic").glob("*")):
200
+ metadata = load_json(task_dir / "metadata.json")
201
+ config = load_yaml(task_dir / "config.yaml")
202
+ extra = SYNTH_EXTRAS[metadata["task_id"]]
203
+ write(task_dir / "task_card.md", synthetic_task_card(task_dir, metadata, config, extra))
204
+
205
+
206
def build_docs() -> None:
    """Write the release-level Markdown docs into ``DOCS_DIR``.

    Two files are produced, in this order: ``benchmark_card.md`` and
    ``evaluation_protocol.md``. Both texts are built before either is written.
    """
    # Output filename -> Markdown body; dict order is the write order.
    documents = {
        "benchmark_card.md": dedent(
            """
            # SimplexTasks-12 Benchmark Card

            SimplexTasks-12 is the processed task collection behind the SimplexUQ benchmark. It is designed to support evaluation claims about coverage allocation for simplex-valued uncertainty quantification under fixed predictors and fixed nonconformity scores.

            ## Supported Claims

            - Whether a fixed simplex-valued predictor exhibits allocation failure beyond its marginal coverage.
            - Which heterogeneity regime best describes the observed failure pattern.
            - Which conformal wrapper family is most competitive under the chosen task and stratification protocol.

            ## Claims The Benchmark Does Not Support

            - Universal wrapper rankings across all simplex tasks.
            - Deployment-readiness claims for any predictor.
            - Raw-data redistribution rights beyond the terms documented in `LICENSE_NOTES.md`.

            ## Benchmark Contents

            - 6 synthetic regimes with canonical `task.npz` bundles and copied YAML configs.
            - 6 fixed-predictor real tasks with cached `(Y, U)` arrays or derived features.
            - Per-task `task_card.md` files and `metadata.json` provenance records.
            - Release-level rebuild instructions for the paper tables and figures.

            ## Reproducibility Contract

            - Benchmark evaluation always operates on frozen predictor outputs.
            - Default stratification rules are fixed before wrapper comparison.
            - Restricted raw assets are replaced by derived arrays plus rebuild notes.
            - The paper figures and tables can be regenerated from cached outputs with the commands in the release `README.md`.

            ## Responsible Use

            Some tasks rely on non-redistributable source assets or derived features from sensitive domains such as face images. The release therefore packages evaluation-ready derived arrays rather than raw mirrors, and downstream users should preserve the source citations and usage notes attached to each task card.
            """
        ).strip(),
        "evaluation_protocol.md": dedent(
            """
            # SimplexTasks-12 Evaluation Protocol

            ## Fixed-Predictor Principle

            Every benchmark slice fixes the predictor first and then varies only the conformal wrapper. This keeps the evaluation target on uncertainty allocation rather than on predictor training.

            ## Default Scores

            - Real and synthetic composition tasks use Aitchison distance unless the target lies on simplex vertices.
            - CIFAR-10 uses total variation / L1 because Aitchison distance is ill-defined at the one-hot boundary.

            ## Stratification Rules

            - Default strata are entropy bins, boundary bins, or task-specific dominant-group partitions.
            - Sensitivity sweeps use fixed alternative stratifications defined from cached prediction vectors only.
            - Stratification maps are not tuned per wrapper and do not depend on calibration/test responses.

            ## Main Metrics

            - Marginal coverage.
            - Max disparity across prediction-space strata.
            - Worst-stratum coverage.
            - Coverage variance.
            - SSCV and mean radius; low-dimensional synthetic tasks also report a simplex-volume ratio.

            ## Wrapper Families

            - Global split conformal.
            - Group-wise / Mondrian conformal.
            - Two-stage normalization.
            - Exact or leave-one-out references where affordable.
            - Diagnostic variants such as OneShot, TrainRes, and the current weighted implementation.

            ## Output Interpretation

            The benchmark is designed to compare wrapper families under visible allocation-efficiency-compute tradeoffs. It should not be reduced to a single leaderboard or read as a conditional-coverage certification protocol.
            """
        ).strip(),
    }

    for filename, body in documents.items():
        write(DOCS_DIR / filename, body)
288
+
289
+
290
def main() -> None:
    """Regenerate the task cards, then the release docs, and report success."""
    for build_step in (build_task_cards, build_docs):
        build_step()
    print("Built SimplexTasks-12 task cards and docs.")


if __name__ == "__main__":
    main()
scripts/cache_affective_text_open_predictions.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build a fully open AffectiveText prediction cache with out-of-fold regressors."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import logging
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ from sklearn.decomposition import TruncatedSVD
11
+ from sklearn.feature_extraction.text import TfidfVectorizer
12
+ from sklearn.model_selection import KFold
13
+ from sklearn.neighbors import KNeighborsRegressor
14
+ from sklearn.preprocessing import Normalizer
15
+
16
+ import sys
17
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
18
+
19
+ from src.data import EMOTION_NAMES, load_affective_text
20
+
21
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
22
+ log = logging.getLogger(__name__)
23
+
24
+
25
def macro_pearson(a: np.ndarray, b: np.ndarray) -> float:
    """Average the per-column Pearson correlation between *a* and *b*.

    Columns are indexed by ``a.shape[1]``. A column is left out of the average
    when its standard deviation in either array is at most ``1e-12``, since
    the correlation is undefined there. Returns ``nan`` if no column qualifies.
    """
    per_column = [
        float(np.corrcoef(a[:, col], b[:, col])[0, 1])
        for col in range(a.shape[1])
        if not (np.std(a[:, col]) <= 1e-12 or np.std(b[:, col]) <= 1e-12)
    ]
    if not per_column:
        return float("nan")
    return float(np.mean(per_column))
34
+
35
+
36
def fit_predict_fold(
    train_texts: list[str],
    test_texts: list[str],
    train_targets: np.ndarray,
    n_components: int,
    n_neighbors: int,
) -> np.ndarray:
    """Fit the TF-IDF / SVD / kNN regressor on one fold and predict the rest.

    The vectorizer, the SVD and the kNN regressor are all fitted on the
    training texts only; the test texts are only transformed and predicted.

    Args:
        train_texts: Texts used for fitting.
        test_texts: Texts to predict.
        train_targets: Regression targets aligned with ``train_texts``.
        n_components: Requested SVD rank; capped by the training matrix size.
        n_neighbors: Requested neighbour count; capped by ``len(train_texts)``.

    Returns:
        The kNN predictions for ``test_texts`` as a float array.
    """
    tfidf = TfidfVectorizer(
        lowercase=True,
        strip_accents="unicode",
        sublinear_tf=True,
        ngram_range=(1, 2),
        min_df=1,
        max_df=0.95,
        stop_words="english",
    )
    train_features = tfidf.fit_transform(train_texts)
    test_features = tfidf.transform(test_texts)

    n_rows, n_cols = train_features.shape
    rank_cap = min(n_rows, n_cols) - 1
    if rank_cap < 2:
        # Too small for a useful SVD: use the dense TF-IDF features directly.
        train_features = train_features.toarray()
        test_features = test_features.toarray()
    else:
        reducer = TruncatedSVD(n_components=min(n_components, rank_cap), random_state=0)
        scaler = Normalizer(copy=False)
        train_features = scaler.fit_transform(reducer.fit_transform(train_features))
        test_features = scaler.transform(reducer.transform(test_features))

    regressor = KNeighborsRegressor(
        n_neighbors=min(n_neighbors, len(train_texts)),
        weights="distance",
        metric="minkowski",
        p=2,
    )
    regressor.fit(train_features, train_targets)
    return np.asarray(regressor.predict(test_features), dtype=float)
74
+
75
+
76
def build_open_predictions(
    headlines: list[str],
    raw_scores: np.ndarray,
    n_splits: int,
    n_components: int,
    n_neighbors: int,
    seed: int,
) -> tuple[np.ndarray, np.ndarray]:
    """Compute out-of-fold predictions for every headline.

    The data is split with a shuffled ``KFold``. For each fold the regressor
    is fitted on the other folds via ``fit_predict_fold`` and used to predict
    the held-out rows, so no row is predicted by a model that saw its target.

    Args:
        headlines: Input texts.
        raw_scores: Targets aligned with ``headlines`` along the first axis.
        n_splits: Number of folds.
        n_components: SVD rank passed to ``fit_predict_fold``.
        n_neighbors: Neighbour count passed to ``fit_predict_fold``.
        seed: ``random_state`` of the fold shuffling.

    Returns:
        ``(preds, folds)``: float predictions shaped like ``raw_scores``
        (negative values clipped to zero) and the fold index of each row.
    """
    n = len(headlines)
    preds = np.zeros_like(raw_scores, dtype=float)
    folds = np.full(n, -1, dtype=int)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold_id, (train_idx, test_idx) in enumerate(splitter.split(headlines)):
        train_texts = [headlines[i] for i in train_idx]
        test_texts = [headlines[i] for i in test_idx]
        train_targets = raw_scores[train_idx]
        fold_preds = fit_predict_fold(
            train_texts=train_texts,
            test_texts=test_texts,
            train_targets=train_targets,
            n_components=n_components,
            n_neighbors=n_neighbors,
        )
        fold_preds = np.clip(fold_preds, 0.0, None)
        zero_rows = fold_preds.sum(axis=1) <= 1e-12
        if np.any(zero_rows):
            # Rows with no positive mass fall back to a mean profile. Use the
            # training-fold mean only: a mean over all rows would include the
            # held-out targets and leak them into the out-of-fold predictions.
            fold_preds[zero_rows] = train_targets.mean(axis=0)
        preds[test_idx] = fold_preds
        folds[test_idx] = fold_id
        log.info("Finished fold %d/%d", fold_id + 1, n_splits)

    return preds, folds
110
+
111
+
112
def main() -> None:
    """Command-line entry point: build and write the open fallback cache.

    Parses the options, refuses to replace an existing output unless
    ``--overwrite`` is given, loads the data, computes out-of-fold
    predictions, logs two Pearson summaries and writes one JSON object per
    headline to the output file.

    Raises:
        FileExistsError: If the output exists and ``--overwrite`` is not set.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--data-dir", default="data/raw/AffectiveText.Semeval.2007")
    cli.add_argument("--output", default="data/processed/affective_text_open_oof_predictions.jsonl")
    for flag, default in (
        ("--n-splits", 5),
        ("--n-components", 128),
        ("--n-neighbors", 25),
        ("--seed", 2026),
        ("--limit", None),
    ):
        cli.add_argument(flag, type=int, default=default)
    cli.add_argument("--overwrite", action="store_true")
    args = cli.parse_args()

    output_path = Path(args.output)
    if not args.overwrite and output_path.exists():
        raise FileExistsError(f"Output already exists: {output_path}")

    dataset = load_affective_text(args.data_dir)
    ids = dataset["ids"]
    headlines = dataset["headlines"]
    raw_scores = np.asarray(dataset["raw_scores"], dtype=float)
    if args.limit is not None:
        head = slice(args.limit)
        ids, headlines, raw_scores = ids[head], headlines[head], raw_scores[head]

    pred_scores, folds = build_open_predictions(
        headlines=headlines,
        raw_scores=raw_scores,
        n_splits=args.n_splits,
        n_components=args.n_components,
        n_neighbors=args.n_neighbors,
        seed=args.seed,
    )

    macro_r = macro_pearson(raw_scores, pred_scores)
    flat_r = float(np.corrcoef(raw_scores.ravel(), pred_scores.ravel())[0, 1])
    log.info(
        "Open fallback predictor quality: macro Pearson=%.3f, flattened Pearson=%.3f",
        macro_r,
        flat_r,
    )

    # The builder settings are the same for every row, so assemble them once.
    builder_info = {
        "n_splits": int(args.n_splits),
        "n_components": int(args.n_components),
        "n_neighbors": int(args.n_neighbors),
        "seed": int(args.seed),
    }

    def encode_row(row_id, headline, scores, fold_id) -> str:
        """Serialise one cache record as a newline-terminated JSON line."""
        record = {
            "id": row_id,
            "headline": headline,
            "emotions": EMOTION_NAMES,
            "scores": list(map(float, scores)),
            "provider": "open_fallback",
            "model": "tfidf_svd_knn_oof",
            "fold": int(fold_id),
            "builder": builder_info,
            "notes": "Deterministic out-of-fold TF-IDF+SVD+kNN regression fallback.",
        }
        return json.dumps(record, ensure_ascii=True) + "\n"

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for record_fields in zip(ids, headlines, pred_scores, folds):
            f.write(encode_row(*record_fields))

    log.info("Finished. Predictions cached at %s", output_path)


if __name__ == "__main__":
    main()
scripts/cache_affective_text_predictions.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Cache zero-shot API emotion scores for SemEval-2007 Affective Text."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import logging
7
+ import os
8
+ import re
9
+ import time
10
+ import urllib.error
11
+ import urllib.parse
12
+ import urllib.request
13
+ from pathlib import Path
14
+
15
+ import sys
16
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
17
+
18
+ from src.data import EMOTION_NAMES, load_affective_text, load_prediction_cache
19
+
20
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
21
+ log = logging.getLogger(__name__)
22
+
23
# Prompt sent for every headline; ``{headline}`` is filled in with
# ``str.format``. The three parts below are joined by single newlines.
PROMPT_TEMPLATE = "\n".join(
    [
        "Rate the following news headline on 6 emotions: anger, disgust, fear, joy, sadness, surprise. "
        "Return only 6 numbers from 0 to 100, comma-separated, in that order.",
        'Headline: "{headline}"',
        "Scores:",
    ]
)
29
+
30
+
31
# One optionally signed integer or decimal number.
_SCORE_NUMBER = r"-?\d+(?:\.\d+)?"
# Six such numbers separated by commas, i.e. the format the prompt asks for.
# The lookbehind stops a match from starting in the middle of a number.
_SCORE_RUN_RE = re.compile(rf"(?<![\d.]){_SCORE_NUMBER}(?:\s*,\s*{_SCORE_NUMBER}){{5}}")


def parse_scores(text: str) -> list[float]:
    """Extract six non-negative emotion scores from a model response.

    The first comma-separated run of six numbers is preferred, so a stray
    number in a preamble (e.g. "Here are the 6 scores: ...") cannot shift the
    scores by one position. If no such run exists, the first six numbers found
    anywhere in *text* are used. Negative values are clamped to ``0.0``.

    Raises:
        ValueError: If fewer than six numbers are found, or if all six scores
            are zero after clamping.
    """
    run = _SCORE_RUN_RE.search(text)
    nums = re.findall(_SCORE_NUMBER, run.group(0) if run else text)
    if len(nums) < 6:
        raise ValueError(f"Could not parse 6 scores from response: {text!r}")
    scores = [max(float(x), 0.0) for x in nums[:6]]
    if sum(scores) <= 0:
        raise ValueError(f"Parsed zero-sum scores from response: {text!r}")
    return scores
39
+
40
+
41
def call_openai_chat_completions(
    headline: str,
    model: str,
    api_key: str,
    base_url: str,
    timeout_sec: float,
) -> tuple[str, dict]:
    """Score one headline through a ``/chat/completions`` endpoint.

    Args:
        headline: Headline substituted into ``PROMPT_TEMPLATE``.
        model: Model name sent in the request payload.
        api_key: Sent as a ``Bearer`` token in the ``Authorization`` header.
        base_url: API root; ``/chat/completions`` is appended to it.
        timeout_sec: Timeout passed to ``urllib.request.urlopen``.

    Returns:
        ``(text, body)``: the first choice's message content and the full
        decoded JSON response.

    Raises:
        KeyError: If the response has no choices or no message content.
        urllib.error.URLError: If ``urlopen`` fails.
        ValueError: If the response body is not valid JSON.
    """
    prompt = PROMPT_TEMPLATE.format(headline=headline)
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a precise annotation model."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0,
    }
    req = urllib.request.Request(
        url=base_url.rstrip("/") + "/chat/completions",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=timeout_sec) as resp:
        body = json.loads(resp.read().decode("utf-8"))

    # Report malformed responses as KeyError, like the Gemini helper does,
    # instead of letting IndexError or a None content escape to the caller.
    choices = body.get("choices") or []
    if not choices:
        raise KeyError(f"No OpenAI choices in response: {body}")
    text = (choices[0].get("message") or {}).get("content")
    if not text:
        raise KeyError(f"No message content in OpenAI response: {body}")
    return text, body
70
+
71
+
72
def call_gemini_generate_content(
    headline: str,
    model: str,
    api_key: str,
    base_url: str,
    timeout_sec: float,
) -> tuple[str, dict]:
    """Score one headline through a Gemini ``generateContent`` endpoint.

    Args:
        headline: Headline substituted into ``PROMPT_TEMPLATE``.
        model: Model name placed in the request path.
        api_key: Sent URL-quoted in the ``key`` query parameter.
        base_url: API root the ``/models/...`` path is appended to.
        timeout_sec: Timeout passed to ``urllib.request.urlopen``.

    Returns:
        ``(text, body)``: the non-empty text parts of the first candidate
        joined by newlines, and the full decoded JSON response.

    Raises:
        KeyError: If the response has no candidates or no text parts.
    """
    request_body = {
        "contents": [
            {
                "role": "user",
                "parts": [{"text": PROMPT_TEMPLATE.format(headline=headline)}],
            }
        ],
        "generationConfig": {"temperature": 0},
    }
    endpoint = "{}/models/{}:generateContent?key={}".format(
        base_url.rstrip("/"), model, urllib.parse.quote(api_key)
    )
    request = urllib.request.Request(
        url=endpoint,
        data=json.dumps(request_body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout_sec) as resp:
        body = json.loads(resp.read().decode("utf-8"))

    candidates = body.get("candidates", [])
    if not candidates:
        raise KeyError(f"No Gemini candidates in response: {body}")

    parts = candidates[0].get("content", {}).get("parts", [])
    # Keep only parts that carry non-empty text.
    fragments = [part["text"] for part in parts if part.get("text")]
    text = "\n".join(fragments)
    if not text:
        raise KeyError(f"No text parts in Gemini response: {body}")
    return text, body
111
+
112
+
113
def main():
    """Command-line entry point: cache API emotion scores for each headline.

    Model, base URL and API-key variable default per provider, and the first
    two can be overridden through environment variables. Unless
    ``--overwrite`` is given, ids already present in the output file are
    skipped and new rows are appended, so an interrupted run can be resumed.
    Each row is flushed as soon as it is written.

    A failed request or an unparseable response is logged and skipped; it
    never aborts the run.

    Raises:
        EnvironmentError: If the API-key environment variable is unset or empty.
    """
    # Local import: only needed for the exception tuple below.
    import http.client

    parser = argparse.ArgumentParser()
    parser.add_argument("--data-dir", default="data/raw/AffectiveText.Semeval.2007")
    parser.add_argument("--output", default="data/processed/affective_text_predictions.jsonl")
    parser.add_argument("--provider", choices=["openai", "gemini"], default="gemini")
    parser.add_argument("--model", default=None)
    parser.add_argument("--base-url", default=None)
    parser.add_argument("--api-key-env", default=None)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--sleep-sec", type=float, default=0.0)
    parser.add_argument("--timeout-sec", type=float, default=60.0)
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    if args.model is None:
        if args.provider == "gemini":
            args.model = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001")
        else:
            args.model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini-2024-07-18")
    if args.base_url is None:
        if args.provider == "gemini":
            args.base_url = os.environ.get("GEMINI_BASE_URL", "https://generativelanguage.googleapis.com/v1beta")
        else:
            args.base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
    if args.api_key_env is None:
        args.api_key_env = "GEMINI_API_KEY" if args.provider == "gemini" else "OPENAI_API_KEY"

    api_key = os.environ.get(args.api_key_env)
    if not api_key:
        raise EnvironmentError(f"Missing API key in env var {args.api_key_env}")

    data = load_affective_text(args.data_dir)
    ids = data["ids"]
    headlines = data["headlines"]
    if args.limit is not None:
        ids = ids[:args.limit]
        headlines = headlines[:args.limit]

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    existing = {}
    if out_path.exists() and not args.overwrite:
        existing = load_prediction_cache(out_path)
        log.info(f"Loaded {len(existing)} cached predictions from {out_path}")

    # Per-row failures that must not abort the whole run:
    # - OSError covers urllib's URLError/HTTPError plus raw socket timeouts and
    #   connection resets raised while reading the response (those are not
    #   wrapped in URLError);
    # - http.client.HTTPException covers truncated or malformed HTTP replies;
    # - ValueError/KeyError/IndexError/TypeError cover invalid JSON and
    #   response bodies that lack the expected structure or text.
    row_errors = (
        OSError,
        http.client.HTTPException,
        ValueError,
        KeyError,
        IndexError,
        TypeError,
    )

    n_done = 0
    n_failed = 0
    mode = "a" if existing and not args.overwrite else "w"
    with open(out_path, mode, encoding="utf-8") as f:
        for idx, headline in zip(ids, headlines):
            if idx in existing and not args.overwrite:
                continue
            try:
                if args.provider == "gemini":
                    raw_text, raw_json = call_gemini_generate_content(
                        headline=headline,
                        model=args.model,
                        api_key=api_key,
                        base_url=args.base_url,
                        timeout_sec=args.timeout_sec,
                    )
                else:
                    raw_text, raw_json = call_openai_chat_completions(
                        headline=headline,
                        model=args.model,
                        api_key=api_key,
                        base_url=args.base_url,
                        timeout_sec=args.timeout_sec,
                    )
                scores = parse_scores(raw_text)
            except row_errors as exc:
                log.error(f"Failed on id={idx}: {exc}")
                n_failed += 1
                continue

            row = {
                "id": idx,
                "headline": headline,
                "emotions": EMOTION_NAMES,
                "scores": scores,
                "provider": args.provider,
                "model": args.model,
                "base_url": args.base_url,
                "prompt_template": PROMPT_TEMPLATE,
                "raw_text": raw_text,
                "raw_response": raw_json,
            }
            f.write(json.dumps(row, ensure_ascii=True) + "\n")
            # Flush per row so completed work survives an interruption.
            f.flush()
            n_done += 1
            if n_done % 50 == 0:
                log.info(f"Cached {n_done} new predictions")
            if args.sleep_sec > 0:
                time.sleep(args.sleep_sec)

    if n_failed:
        log.warning("%d headlines failed and were not cached", n_failed)
    log.info(f"Finished. Predictions cached at {out_path}")


if __name__ == "__main__":
    main()