Initial anonymous code release
Browse files. This view is limited to 50 files because it contains too many changes. See the raw diff.
- CITATION.cff +14 -0
- LICENSE +6 -0
- README.md +20 -0
- configs/methods/fullcp.yaml +4 -0
- configs/methods/global.yaml +4 -0
- configs/methods/jackknife_plus.yaml +4 -0
- configs/methods/oneshot.yaml +4 -0
- configs/methods/partition.yaml +4 -0
- configs/methods/trainres.yaml +5 -0
- configs/methods/twostage.yaml +4 -0
- configs/methods/weighted.yaml +4 -0
- configs/real/affectivetext.yaml +6 -0
- configs/real/cifar10_softmax.yaml +6 -0
- configs/real/pbmc_pseudobulk.yaml +6 -0
- configs/real/samson_unmixing.yaml +6 -0
- configs/real/topics_20newsgroups.yaml +6 -0
- configs/real/utkface_ldl.yaml +6 -0
- configs/synthetic/D1.yaml +35 -0
- configs/synthetic/D2.yaml +37 -0
- configs/synthetic/D3.yaml +36 -0
- configs/synthetic/D4.yaml +39 -0
- configs/synthetic/D5.yaml +36 -0
- configs/synthetic/D6.yaml +33 -0
- docs/faq.md +14 -0
- docs/release_contract.md +7 -0
- docs/restricted_assets.md +12 -0
- docs/reviewer_quickstart.md +19 -0
- docs/task_limitations.md +7 -0
- environment.yml +14 -0
- outputs/README.md +1 -0
- pyproject.toml +28 -0
- rebuild/affectivetext/README.md +3 -0
- rebuild/affectivetext/cache_affective_text_open_predictions.py +179 -0
- rebuild/affectivetext/rebuild_gold_labels.py +6 -0
- rebuild/affectivetext/validate_cache_schema.py +11 -0
- rebuild/cifar10/README.md +3 -0
- rebuild/cifar10/rebuild_from_torchvision.py +3 -0
- rebuild/pbmc/README.md +3 -0
- rebuild/pbmc/generate_pseudobulk.py +3 -0
- rebuild/pbmc/rebuild_from_pbmc3k.py +3 -0
- rebuild/samson/README.md +3 -0
- rebuild/samson/rebuild_from_public_bundle.py +3 -0
- rebuild/topics/README.md +3 -0
- rebuild/topics/rebuild_from_sklearn_fetcher.py +3 -0
- rebuild/utkface/README.md +3 -0
- rebuild/utkface/rebuild_from_utkface.py +3 -0
- requirements.txt +8 -0
- scripts/build_simplextasks_docs.py +297 -0
- scripts/cache_affective_text_open_predictions.py +179 -0
- scripts/cache_affective_text_predictions.py +209 -0
CITATION.cff
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
cff-version: 1.2.0
|
| 2 |
+
title: "SimplexUQ code artifact"
|
| 3 |
+
message: "If you use this benchmark code, please cite the accompanying benchmark paper."
|
| 4 |
+
type: software
|
| 5 |
+
version: 0.1.0
|
| 6 |
+
authors:
|
| 7 |
+
- family-names: "Authors"
|
| 8 |
+
given-names: "Anonymous"
|
| 9 |
+
abstract: "Executable code artifact for reproducing the SimplexUQ benchmark figures and tables from frozen derived arrays."
|
| 10 |
+
keywords:
|
| 11 |
+
- simplex
|
| 12 |
+
- conformal prediction
|
| 13 |
+
- benchmark
|
| 14 |
+
license: "other"
|
LICENSE
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Code Artifact License Notice
|
| 2 |
+
|
| 3 |
+
This anonymous code bundle is provided for NeurIPS E&D review and benchmark
|
| 4 |
+
reproduction. It does not grant rights to redistribute restricted source
|
| 5 |
+
datasets or raw API outputs. Use the code together with the provenance and
|
| 6 |
+
restricted-asset notes shipped in the paired SimplexTasks-12 data bundle.
|
README.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# simplexuq-code
|
| 2 |
+
|
| 3 |
+
Anonymous code bundle for the SimplexUQ benchmark.
|
| 4 |
+
|
| 5 |
+
This repository is meant to be paired with the `SimplexTasks-12-data` dataset
|
| 6 |
+
artifact. It contains:
|
| 7 |
+
|
| 8 |
+
- `src/` benchmark logic and utility code
|
| 9 |
+
- `scripts/` benchmark runners and figure/table reproducers
|
| 10 |
+
- `rebuild/` task-specific rebuild notes and scripts for restricted or hard-to-redistribute assets
|
| 11 |
+
- `configs/` synthetic, real, and method configuration files
|
| 12 |
+
- `docs/` reviewer-facing quickstart and release notes
|
| 13 |
+
|
| 14 |
+
Typical usage:
|
| 15 |
+
|
| 16 |
+
```bash
|
| 17 |
+
python scripts/check_artifact_integrity.py
|
| 18 |
+
python scripts/reproduce_tables.py
|
| 19 |
+
python scripts/reproduce_figures.py
|
| 20 |
+
```
|
configs/methods/fullcp.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
method: fullcp
|
| 2 |
+
family: exact / local-scale reference
|
| 3 |
+
paper_label: FullCP
|
| 4 |
+
validity: Exact marginal, expensive in large settings
|
configs/methods/global.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
method: global
|
| 2 |
+
family: split
|
| 3 |
+
paper_label: Global
|
| 4 |
+
validity: Exact marginal under exchangeability
|
configs/methods/jackknife_plus.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
method: jackknife_plus
|
| 2 |
+
family: leave-one-out reference
|
| 3 |
+
paper_label: Jackknife+
|
| 4 |
+
validity: Approximate or exact depending on setting; used as a reference in this benchmark
|
configs/methods/oneshot.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
method: oneshot
|
| 2 |
+
family: diagnostic normalization
|
| 3 |
+
paper_label: OneShot
|
| 4 |
+
validity: No general exchangeability guarantee
|
configs/methods/partition.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
method: partition
|
| 2 |
+
family: group-wise
|
| 3 |
+
paper_label: Mondrian
|
| 4 |
+
validity: Exact within fixed groups
|
configs/methods/trainres.yaml
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
method: trainres
|
| 2 |
+
family: training-residual normalization
|
| 3 |
+
paper_label: TrainRes
|
| 4 |
+
validity: Can retain marginal validity under strong conditions but may misallocate
|
| 5 |
+
badly
|
configs/methods/twostage.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
method: twostage
|
| 2 |
+
family: normalized split
|
| 3 |
+
paper_label: TwoStage
|
| 4 |
+
validity: Exact marginal when the scale fit is independent
|
configs/methods/weighted.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
method: weighted
|
| 2 |
+
family: weighted conformal diagnostic
|
| 3 |
+
paper_label: Weighted
|
| 4 |
+
validity: Implementation-specific diagnostic only in this benchmark
|
configs/real/affectivetext.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task_id: affectivetext_emotions
|
| 2 |
+
default_score: aitchison
|
| 3 |
+
default_stratification: boundary
|
| 4 |
+
alpha: 0.1
|
| 5 |
+
repetitions: 200
|
| 6 |
+
benchmark_mode: fixed_predictor
|
configs/real/cifar10_softmax.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task_id: cifar10_softmax
|
| 2 |
+
default_score: tv
|
| 3 |
+
default_stratification: entropy
|
| 4 |
+
alpha: 0.1
|
| 5 |
+
repetitions: 50
|
| 6 |
+
benchmark_mode: fixed_predictor
|
configs/real/pbmc_pseudobulk.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task_id: pbmc3k_pseudobulk
|
| 2 |
+
default_score: aitchison
|
| 3 |
+
default_stratification: boundary
|
| 4 |
+
alpha: 0.1
|
| 5 |
+
repetitions: 200
|
| 6 |
+
benchmark_mode: fixed_predictor
|
configs/real/samson_unmixing.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task_id: samson_unmixing
|
| 2 |
+
default_score: aitchison
|
| 3 |
+
default_stratification: boundary
|
| 4 |
+
alpha: 0.1
|
| 5 |
+
repetitions: 50
|
| 6 |
+
benchmark_mode: fixed_predictor
|
configs/real/topics_20newsgroups.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task_id: topics_20ng
|
| 2 |
+
default_score: aitchison
|
| 3 |
+
default_stratification: entropy
|
| 4 |
+
alpha: 0.1
|
| 5 |
+
repetitions: 50
|
| 6 |
+
benchmark_mode: fixed_predictor
|
configs/real/utkface_ldl.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task_id: utkface_age_ldl
|
| 2 |
+
default_score: aitchison
|
| 3 |
+
default_stratification: entropy
|
| 4 |
+
alpha: 0.1
|
| 5 |
+
repetitions: 50
|
| 6 |
+
benchmark_mode: fixed_predictor
|
configs/synthetic/D1.yaml
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment: d1_homogeneous
|
| 2 |
+
|
| 3 |
+
dgp:
|
| 4 |
+
name: pure_scale
|
| 5 |
+
K: 3
|
| 6 |
+
sigma_min: 0.2
|
| 7 |
+
c: 0.0
|
| 8 |
+
d_x: 2
|
| 9 |
+
|
| 10 |
+
data:
|
| 11 |
+
n_train: 1000
|
| 12 |
+
n_cal: 1000
|
| 13 |
+
n_scale_est: 500
|
| 14 |
+
n_test: 5000
|
| 15 |
+
n_rep: 200
|
| 16 |
+
|
| 17 |
+
methods:
|
| 18 |
+
- global
|
| 19 |
+
- partition
|
| 20 |
+
- twostage
|
| 21 |
+
- oneshot
|
| 22 |
+
- trainres
|
| 23 |
+
- weighted
|
| 24 |
+
- oracle
|
| 25 |
+
|
| 26 |
+
evaluation:
|
| 27 |
+
alpha: 0.1
|
| 28 |
+
strata_method: boundary
|
| 29 |
+
n_strata: 5
|
| 30 |
+
|
| 31 |
+
weighting:
|
| 32 |
+
mode: inverse_sigma
|
| 33 |
+
source: knn_loo
|
| 34 |
+
|
| 35 |
+
seed: 2026
|
configs/synthetic/D2.yaml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment: d2_pure_scale
|
| 2 |
+
|
| 3 |
+
dgp:
|
| 4 |
+
name: pure_scale
|
| 5 |
+
K: 3
|
| 6 |
+
sigma_min: 0.1
|
| 7 |
+
c: 0.5
|
| 8 |
+
d_x: 2
|
| 9 |
+
|
| 10 |
+
data:
|
| 11 |
+
n_train: 500
|
| 12 |
+
n_cal: 500
|
| 13 |
+
n_scale_est: 250
|
| 14 |
+
n_test: 5000
|
| 15 |
+
n_rep: 200
|
| 16 |
+
|
| 17 |
+
methods:
|
| 18 |
+
- global
|
| 19 |
+
- fullcp
|
| 20 |
+
- jackknife_plus
|
| 21 |
+
- partition
|
| 22 |
+
- twostage
|
| 23 |
+
- oneshot
|
| 24 |
+
- trainres
|
| 25 |
+
- weighted
|
| 26 |
+
- oracle
|
| 27 |
+
|
| 28 |
+
evaluation:
|
| 29 |
+
alpha: 0.1
|
| 30 |
+
strata_method: boundary
|
| 31 |
+
n_strata: 5
|
| 32 |
+
|
| 33 |
+
weighting:
|
| 34 |
+
mode: inverse_sigma
|
| 35 |
+
source: knn_loo
|
| 36 |
+
|
| 37 |
+
seed: 2026
|
configs/synthetic/D3.yaml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment: d3_discrete_groups_aligned
|
| 2 |
+
|
| 3 |
+
dgp:
|
| 4 |
+
name: discrete_groups
|
| 5 |
+
K: 10
|
| 6 |
+
sigma_low: 0.08
|
| 7 |
+
sigma_high: 0.30
|
| 8 |
+
d_x: 5
|
| 9 |
+
easy_classes: 5
|
| 10 |
+
|
| 11 |
+
data:
|
| 12 |
+
n_train: 500
|
| 13 |
+
n_cal: 500
|
| 14 |
+
n_scale_est: 250
|
| 15 |
+
n_test: 5000
|
| 16 |
+
n_rep: 200
|
| 17 |
+
|
| 18 |
+
methods:
|
| 19 |
+
- global
|
| 20 |
+
- partition
|
| 21 |
+
- twostage
|
| 22 |
+
- fullcp
|
| 23 |
+
- jackknife_plus
|
| 24 |
+
- oracle
|
| 25 |
+
|
| 26 |
+
evaluation:
|
| 27 |
+
alpha: 0.1
|
| 28 |
+
strata_method: argmax_group
|
| 29 |
+
n_strata: 2
|
| 30 |
+
split_index: 5
|
| 31 |
+
|
| 32 |
+
weighting:
|
| 33 |
+
mode: inverse_sigma
|
| 34 |
+
source: knn_loo
|
| 35 |
+
|
| 36 |
+
seed: 2026
|
configs/synthetic/D4.yaml
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment: d4_model_bias
|
| 2 |
+
|
| 3 |
+
dgp:
|
| 4 |
+
name: model_bias
|
| 5 |
+
K: 3
|
| 6 |
+
sigma_min: 0.1
|
| 7 |
+
c: 0.15
|
| 8 |
+
d_x: 2
|
| 9 |
+
bias_scale: 0.45
|
| 10 |
+
bias_type: rotational
|
| 11 |
+
|
| 12 |
+
data:
|
| 13 |
+
n_train: 500
|
| 14 |
+
n_cal: 500
|
| 15 |
+
n_scale_est: 250
|
| 16 |
+
n_test: 5000
|
| 17 |
+
n_rep: 200
|
| 18 |
+
|
| 19 |
+
methods:
|
| 20 |
+
- global
|
| 21 |
+
- fullcp
|
| 22 |
+
- jackknife_plus
|
| 23 |
+
- partition
|
| 24 |
+
- twostage
|
| 25 |
+
- oneshot
|
| 26 |
+
- trainres
|
| 27 |
+
- weighted
|
| 28 |
+
- oracle
|
| 29 |
+
|
| 30 |
+
evaluation:
|
| 31 |
+
alpha: 0.1
|
| 32 |
+
strata_method: boundary
|
| 33 |
+
n_strata: 5
|
| 34 |
+
|
| 35 |
+
weighting:
|
| 36 |
+
mode: inverse_sigma
|
| 37 |
+
source: knn_loo
|
| 38 |
+
|
| 39 |
+
seed: 2026
|
configs/synthetic/D5.yaml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment: d5_heavy_tail
|
| 2 |
+
|
| 3 |
+
dgp:
|
| 4 |
+
name: heavy_tail
|
| 5 |
+
K: 3
|
| 6 |
+
sigma_min: 0.1
|
| 7 |
+
c: 0.5
|
| 8 |
+
d_x: 2
|
| 9 |
+
df: 3.0
|
| 10 |
+
|
| 11 |
+
data:
|
| 12 |
+
n_train: 500
|
| 13 |
+
n_cal: 500
|
| 14 |
+
n_scale_est: 250
|
| 15 |
+
n_test: 5000
|
| 16 |
+
n_rep: 200
|
| 17 |
+
|
| 18 |
+
methods:
|
| 19 |
+
- global
|
| 20 |
+
- fullcp
|
| 21 |
+
- jackknife_plus
|
| 22 |
+
- partition
|
| 23 |
+
- twostage
|
| 24 |
+
- weighted
|
| 25 |
+
- oracle
|
| 26 |
+
|
| 27 |
+
evaluation:
|
| 28 |
+
alpha: 0.1
|
| 29 |
+
strata_method: boundary
|
| 30 |
+
n_strata: 5
|
| 31 |
+
|
| 32 |
+
weighting:
|
| 33 |
+
mode: inverse_sigma
|
| 34 |
+
source: knn_loo
|
| 35 |
+
|
| 36 |
+
seed: 2026
|
configs/synthetic/D6.yaml
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment: d6_high_k
|
| 2 |
+
|
| 3 |
+
dgp:
|
| 4 |
+
name: high_k
|
| 5 |
+
K: 50
|
| 6 |
+
sigma_min: 0.05
|
| 7 |
+
c: 0.35
|
| 8 |
+
d_x: 10
|
| 9 |
+
|
| 10 |
+
data:
|
| 11 |
+
n_train: 5000
|
| 12 |
+
n_cal: 5000
|
| 13 |
+
n_scale_est: 2500
|
| 14 |
+
n_test: 5000
|
| 15 |
+
n_rep: 200
|
| 16 |
+
|
| 17 |
+
methods:
|
| 18 |
+
- global
|
| 19 |
+
- partition
|
| 20 |
+
- twostage
|
| 21 |
+
- weighted
|
| 22 |
+
- oracle
|
| 23 |
+
|
| 24 |
+
evaluation:
|
| 25 |
+
alpha: 0.1
|
| 26 |
+
strata_method: entropy
|
| 27 |
+
n_strata: 5
|
| 28 |
+
|
| 29 |
+
weighting:
|
| 30 |
+
mode: inverse_sigma
|
| 31 |
+
source: knn_loo
|
| 32 |
+
|
| 33 |
+
seed: 2026
|
docs/faq.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FAQ
|
| 2 |
+
|
| 3 |
+
## Why are there no raw images or raw headlines here?
|
| 4 |
+
Because this artifact is evaluation-first and respects source-asset terms. The
|
| 5 |
+
benchmark runs on frozen derived arrays and rebuild metadata instead of mirroring
|
| 6 |
+
restricted raw assets.
|
| 7 |
+
|
| 8 |
+
## Why are there two upload bundles?
|
| 9 |
+
Splitting data and code keeps the dataset artifact clean and reduces ambiguity
|
| 10 |
+
about what counts as the benchmark state versus what counts as execution logic.
|
| 11 |
+
|
| 12 |
+
## What should a reviewer run first?
|
| 13 |
+
After the `scripts/check_artifact_integrity.py` layout check, the figure/table reproduction helpers. These helpers operate on frozen
|
| 14 |
+
derived arrays and are the shortest path to the paper outputs.
|
docs/release_contract.md
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Release Contract
|
| 2 |
+
|
| 3 |
+
This code bundle assumes that benchmark evaluation is run on frozen derived arrays.
|
| 4 |
+
It does not require raw-asset mirrors for the paper-level reproduction path.
|
| 5 |
+
|
| 6 |
+
The `rebuild/` directories are only for tasks whose source assets are restricted
|
| 7 |
+
or inconvenient to redistribute directly.
|
docs/restricted_assets.md
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Restricted Assets
|
| 2 |
+
|
| 3 |
+
The following raw assets are intentionally excluded from the data bundle:
|
| 4 |
+
|
| 5 |
+
- CIFAR-10 image archive
|
| 6 |
+
- UTKFace face-image archive
|
| 7 |
+
- Raw AffectiveText headlines
|
| 8 |
+
- Raw AffectiveText API responses
|
| 9 |
+
|
| 10 |
+
Rebuild instructions and metadata are provided instead. The benchmark runner is
|
| 11 |
+
designed to consume frozen derived arrays, so raw mirrors are not required for
|
| 12 |
+
the paper-level reproducibility path.
|
docs/reviewer_quickstart.md
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Reviewer Quickstart
|
| 2 |
+
|
| 3 |
+
1. Place or symlink the `SimplexTasks-12-data` bundle next to this code bundle.
|
| 4 |
+
2. Create an environment from `environment.yml` or install the packages listed in
|
| 5 |
+
`requirements.txt`.
|
| 6 |
+
3. Verify the two-bundle layout:
|
| 7 |
+
|
| 8 |
+
```bash
|
| 9 |
+
python scripts/check_artifact_integrity.py
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
4. Regenerate tables and figures from the frozen cached inputs:
|
| 13 |
+
|
| 14 |
+
```bash
|
| 15 |
+
python scripts/reproduce_tables.py
|
| 16 |
+
python scripts/reproduce_figures.py
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
5. Inspect `outputs/tables/` and `outputs/figures/`.
|
docs/task_limitations.md
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Task Limitations
|
| 2 |
+
|
| 3 |
+
- CIFAR-10 is a classification-style stress test, not a naturally continuous composition task.
|
| 4 |
+
- Topics is model-derived and should be read as topic-mixture evaluation rather than raw-label truth.
|
| 5 |
+
- PBMC is semi-synthetic and is intended as a control-style benchmark slice.
|
| 6 |
+
- UTKFace and AffectiveText rely on derived artifacts because the source assets are restricted.
|
| 7 |
+
- Samson is the cleanest natural low-dimensional composition task in the bundle.
|
environment.yml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: simplexuq-code
|
| 2 |
+
channels:
|
| 3 |
+
- conda-forge
|
| 4 |
+
dependencies:
|
| 5 |
+
- python=3.11
|
| 6 |
+
- numpy>=1.24
|
| 7 |
+
- scipy>=1.10
|
| 8 |
+
- scikit-learn>=1.3
|
| 9 |
+
- matplotlib>=3.7
|
| 10 |
+
- pyyaml>=6.0
|
| 11 |
+
- scanpy
|
| 12 |
+
- anndata
|
| 13 |
+
- rpy2
|
| 14 |
+
- pip
|
outputs/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Generated benchmark outputs go under this directory.
|
pyproject.toml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "simplexuq-code"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
requires-python = ">=3.10,<3.14"
|
| 5 |
+
dependencies = [
|
| 6 |
+
"numpy>=1.24",
|
| 7 |
+
"scipy>=1.10",
|
| 8 |
+
"scikit-learn>=1.3",
|
| 9 |
+
"matplotlib>=3.7",
|
| 10 |
+
"pyyaml>=6.0",
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
[project.optional-dependencies]
|
| 14 |
+
bio = ["scanpy", "anndata", "rpy2"] # for deconvolution experiments
|
| 15 |
+
r = ["rpy2"] # for R integration (visualization, scRNA analysis)
|
| 16 |
+
dev = ["pytest", "ruff", "ipykernel"]
|
| 17 |
+
gpu = ["torch>=2.0", "torchvision>=0.15"] # for CIFAR softmax experiment
|
| 18 |
+
|
| 19 |
+
[tool.setuptools.packages.find]
|
| 20 |
+
where = ["."]
|
| 21 |
+
include = ["src*"]
|
| 22 |
+
|
| 23 |
+
[tool.ruff]
|
| 24 |
+
line-length = 100
|
| 25 |
+
select = ["E", "F", "I"]
|
| 26 |
+
|
| 27 |
+
[tool.pytest.ini_options]
|
| 28 |
+
testpaths = ["tests"]
|
rebuild/affectivetext/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# affectivetext
|
| 2 |
+
|
| 3 |
+
Rebuild gold labels from the SemEval archive and use the open fallback cache-builder when API access is unavailable.
|
rebuild/affectivetext/cache_affective_text_open_predictions.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build a fully open AffectiveText prediction cache with out-of-fold regressors."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
from sklearn.decomposition import TruncatedSVD
|
| 11 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 12 |
+
from sklearn.model_selection import KFold
|
| 13 |
+
from sklearn.neighbors import KNeighborsRegressor
|
| 14 |
+
from sklearn.preprocessing import Normalizer
|
| 15 |
+
|
| 16 |
+
import sys
|
| 17 |
+
# Locate the repository root (the nearest ancestor directory that contains
# ``src/``) so that ``from src.data import ...`` works whether this file is run
# from ``scripts/`` or from ``rebuild/<task>/``. The previous hard-coded
# ``parent.parent`` pointed at ``rebuild/`` for this copy, which has no ``src``
# package; it is kept only as the fallback when no ancestor contains ``src/``.
REPO_ROOT = next(
    (p for p in Path(__file__).resolve().parents if (p / "src").is_dir()),
    Path(__file__).resolve().parent.parent,
)
sys.path.insert(0, str(REPO_ROOT))
|
| 18 |
+
|
| 19 |
+
from src.data import EMOTION_NAMES, load_affective_text
|
| 20 |
+
|
| 21 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
| 22 |
+
log = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def macro_pearson(a: np.ndarray, b: np.ndarray) -> float:
    """Return the mean per-column Pearson correlation between ``a`` and ``b``.

    Args:
        a: 2-D array-like; each column is correlated with the matching column of ``b``.
        b: 2-D array-like with exactly the same shape as ``a``.

    Returns:
        The unweighted mean of the column-wise Pearson coefficients. Columns in
        which either input has a standard deviation of at most ``1e-12`` are
        skipped (their correlation is undefined); ``nan`` is returned when every
        column is skipped.

    Raises:
        ValueError: If the inputs are not 2-D or their shapes differ.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if a.ndim != 2 or a.shape != b.shape:
        # Without this check a wider ``b`` was silently truncated to ``a``'s
        # columns, yielding a plausible-looking but meaningless score.
        raise ValueError(
            f"macro_pearson expects two 2-D arrays of equal shape, got {a.shape} and {b.shape}"
        )

    vals = []
    for aj, bj in zip(a.T, b.T):
        if np.std(aj) <= 1e-12 or np.std(bj) <= 1e-12:
            continue
        vals.append(float(np.corrcoef(aj, bj)[0, 1]))
    return float(np.mean(vals)) if vals else float("nan")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def fit_predict_fold(
    train_texts: list[str],
    test_texts: list[str],
    train_targets: np.ndarray,
    n_components: int,
    n_neighbors: int,
) -> np.ndarray:
    """Fit the TF-IDF -> SVD -> kNN pipeline on one fold and predict the held-out texts.

    The vectorizer, the SVD projection and the normalizer are all fitted on
    ``train_texts`` only and then applied to ``test_texts``.

    Args:
        train_texts: Texts used to fit every stage of the pipeline.
        test_texts: Texts to predict for.
        train_targets: Regression targets aligned with ``train_texts``.
        n_components: Requested SVD rank; capped by the training matrix size.
        n_neighbors: Requested neighbour count; capped by ``len(train_texts)``.

    Returns:
        The regressor's predictions for ``test_texts`` as a float array.
    """
    tfidf = TfidfVectorizer(
        lowercase=True,
        strip_accents="unicode",
        sublinear_tf=True,
        ngram_range=(1, 2),
        min_df=1,
        max_df=0.95,
        stop_words="english",
    )
    train_features = tfidf.fit_transform(train_texts)
    test_features = tfidf.transform(test_texts)

    # The SVD rank must stay strictly below both dimensions of the training matrix.
    n_docs, n_terms = train_features.shape
    rank_cap = min(n_docs, n_terms) - 1
    if rank_cap < 2:
        # Too little data for a useful projection: use the dense TF-IDF rows directly.
        train_features = train_features.toarray()
        test_features = test_features.toarray()
    else:
        reducer = TruncatedSVD(n_components=min(n_components, rank_cap), random_state=0)
        row_scaler = Normalizer(copy=False)
        train_features = row_scaler.fit_transform(reducer.fit_transform(train_features))
        test_features = row_scaler.transform(reducer.transform(test_features))

    regressor = KNeighborsRegressor(
        n_neighbors=min(n_neighbors, len(train_texts)),
        weights="distance",
        metric="minkowski",
        p=2,
    )
    regressor.fit(train_features, train_targets)
    held_out = regressor.predict(test_features)
    return np.asarray(held_out, dtype=float)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def build_open_predictions(
    headlines: list[str],
    raw_scores: np.ndarray,
    n_splits: int,
    n_components: int,
    n_neighbors: int,
    seed: int,
) -> tuple[np.ndarray, np.ndarray]:
    """Produce out-of-fold predictions for every headline with shuffled K-fold.

    Each headline is predicted by a pipeline (see ``fit_predict_fold``) fitted
    on the other folds only. Predictions are clipped to be non-negative; rows
    whose clipped prediction sums to (numerically) zero are replaced by the mean
    target of that fold's *training* split.

    Args:
        headlines: Input texts, one per sample.
        raw_scores: Target matrix aligned row-wise with ``headlines``.
        n_splits: Number of folds passed to ``KFold``.
        n_components: SVD rank forwarded to ``fit_predict_fold``.
        n_neighbors: Neighbour count forwarded to ``fit_predict_fold``.
        seed: ``random_state`` of the shuffled ``KFold`` splitter.

    Returns:
        ``(preds, folds)`` where ``preds`` has the shape of ``raw_scores`` and
        ``folds[i]`` is the index of the fold in which sample ``i`` was held out.

    Raises:
        ValueError: If ``headlines`` and ``raw_scores`` have different lengths.
    """
    raw_scores = np.asarray(raw_scores, dtype=float)
    n = len(headlines)
    if raw_scores.shape[0] != n:
        raise ValueError(
            f"headlines ({n}) and raw_scores ({raw_scores.shape[0]}) must have the same length"
        )

    preds = np.zeros_like(raw_scores, dtype=float)
    folds = np.full(n, -1, dtype=int)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold_id, (train_idx, test_idx) in enumerate(splitter.split(headlines)):
        train_texts = [headlines[i] for i in train_idx]
        test_texts = [headlines[i] for i in test_idx]
        train_targets = raw_scores[train_idx]
        fold_preds = fit_predict_fold(
            train_texts=train_texts,
            test_texts=test_texts,
            train_targets=train_targets,
            n_components=n_components,
            n_neighbors=n_neighbors,
        )
        fold_preds = np.clip(fold_preds, 0.0, None)
        zero_rows = fold_preds.sum(axis=1) <= 1e-12
        if np.any(zero_rows):
            # Fall back to the mean of the training split only. A mean over all
            # of ``raw_scores`` would include the held-out targets and leak
            # labels into predictions that are meant to be out-of-fold.
            # NOTE: caches built with the earlier all-data mean can differ from
            # a rebuild on exactly these fallback rows.
            fold_preds[zero_rows] = train_targets.mean(axis=0)
        preds[test_idx] = fold_preds
        folds[test_idx] = fold_id
        log.info("Finished fold %d/%d", fold_id + 1, n_splits)

    return preds, folds
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def main() -> None:
    """Command-line entry point: build and cache the open fallback predictions.

    Parses the CLI options, loads the AffectiveText data via
    ``load_affective_text``, computes out-of-fold predictions with
    ``build_open_predictions``, logs two Pearson quality summaries and writes
    one JSON object per headline to the ``--output`` JSONL file.

    Raises:
        FileExistsError: If the output file exists and ``--overwrite`` is not set.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--data-dir", default="data/raw/AffectiveText.Semeval.2007")
    cli.add_argument("--output", default="data/processed/affective_text_open_oof_predictions.jsonl")
    cli.add_argument("--n-splits", type=int, default=5)
    cli.add_argument("--n-components", type=int, default=128)
    cli.add_argument("--n-neighbors", type=int, default=25)
    cli.add_argument("--seed", type=int, default=2026)
    cli.add_argument("--limit", type=int, default=None)
    cli.add_argument("--overwrite", action="store_true")
    args = cli.parse_args()

    # Refuse to clobber an existing cache unless explicitly asked to.
    output_path = Path(args.output)
    if not args.overwrite and output_path.exists():
        raise FileExistsError(f"Output already exists: {output_path}")

    data = load_affective_text(args.data_dir)
    ids, headlines = data["ids"], data["headlines"]
    raw_scores = np.asarray(data["raw_scores"], dtype=float)
    if args.limit is not None:
        # Keep only the first ``--limit`` samples of every aligned sequence.
        head = slice(None, args.limit)
        ids, headlines, raw_scores = ids[head], headlines[head], raw_scores[head]

    pred_scores, folds = build_open_predictions(
        headlines=headlines,
        raw_scores=raw_scores,
        n_splits=args.n_splits,
        n_components=args.n_components,
        n_neighbors=args.n_neighbors,
        seed=args.seed,
    )

    per_emotion_r = macro_pearson(raw_scores, pred_scores)
    pooled_r = float(np.corrcoef(raw_scores.reshape(-1), pred_scores.reshape(-1))[0, 1])
    log.info(
        "Open fallback predictor quality: macro Pearson=%.3f, flattened Pearson=%.3f",
        per_emotion_r,
        pooled_r,
    )

    # The builder settings are identical for every row, so assemble them once.
    builder_meta = {
        "n_splits": int(args.n_splits),
        "n_components": int(args.n_components),
        "n_neighbors": int(args.n_neighbors),
        "seed": int(args.seed),
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as sink:
        for sample_id, headline, scores, fold_id in zip(ids, headlines, pred_scores, folds):
            record = {
                "id": sample_id,
                "headline": headline,
                "emotions": EMOTION_NAMES,
                "scores": list(map(float, scores)),
                "provider": "open_fallback",
                "model": "tfidf_svd_knn_oof",
                "fold": int(fold_id),
                "builder": builder_meta,
                "notes": "Deterministic out-of-fold TF-IDF+SVD+kNN regression fallback.",
            }
            sink.write(json.dumps(record, ensure_ascii=True) + "\n")

    log.info("Finished. Predictions cached at %s", output_path)


if __name__ == "__main__":
    main()
|
rebuild/affectivetext/rebuild_gold_labels.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Load the AffectiveText gold labels from a raw data directory and print their shape.

Usage:
    python rebuild/affectivetext/rebuild_gold_labels.py <AffectiveText data dir>
"""
import sys
from pathlib import Path

# When run directly, only this script's own directory is on ``sys.path``, so
# the top-level ``src`` package is not importable. Add the nearest ancestor
# directory that contains ``src/``; if none is found, leave ``sys.path`` alone.
_SRC_PARENT = next(
    (p for p in Path(__file__).resolve().parents if (p / "src").is_dir()),
    None,
)
if _SRC_PARENT is not None and str(_SRC_PARENT) not in sys.path:
    sys.path.insert(0, str(_SRC_PARENT))

from src.data import load_affective_text  # noqa: E402  (needs the path setup above)

if len(sys.argv) < 2:
    # Fail with a usage hint instead of a bare IndexError.
    raise SystemExit(f"usage: {Path(sys.argv[0]).name} <AffectiveText data dir>")

root = Path(sys.argv[1])
data = load_affective_text(root)
# NOTE(review): assumes the loader's result exposes a 'Y' array; the sibling
# cache builder only reads 'ids', 'headlines' and 'raw_scores' — confirm.
print(data['Y'].shape)
|
rebuild/affectivetext/validate_cache_schema.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Validate the JSONL schema of an AffectiveText prediction cache.

Usage:
    python rebuild/affectivetext/validate_cache_schema.py <cache.jsonl>
"""
from pathlib import Path
import json
import sys

# Fields every cache row must carry, whichever builder produced it.
REQUIRED_FIELDS = ('id', 'scores', 'provider')
# Providers that do not prompt a model and therefore write no 'prompt_template'.
# The open fallback builder shipped in this bundle writes provider
# "open_fallback" and no such field, so requiring it unconditionally rejected
# every fallback cache.
PROMPT_FREE_PROVIDERS = frozenset({'open_fallback'})


def find_schema_error(path):
    """Return a message describing the first schema violation, or ``None``.

    Args:
        path: Location of the JSONL cache file (one JSON object per line).

    Returns:
        ``None`` when every non-blank line is a JSON object with the required
        fields; otherwise a human-readable message naming the 1-based line.
    """
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f, 1):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as exc:
                return f'invalid JSON at line {i}: {exc.msg}'
            if not isinstance(row, dict):
                return f'expected a JSON object at line {i}'
            required = list(REQUIRED_FIELDS)
            if row.get('provider') not in PROMPT_FREE_PROVIDERS:
                required.append('prompt_template')
            for field in required:
                if field not in row:
                    return f'missing {field} at line {i}'
    return None


def main(argv=None):
    """Validate the cache named on the command line; exit non-zero on failure.

    Args:
        argv: Argument list without the program name; defaults to ``sys.argv[1:]``.

    Raises:
        SystemExit: With a message when the path argument is missing or the
            cache violates the schema.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        raise SystemExit('usage: validate_cache_schema.py <cache.jsonl>')
    error = find_schema_error(Path(args[0]))
    if error is not None:
        raise SystemExit(error)
    print('cache schema ok')


if __name__ == '__main__':
    main()
|
rebuild/cifar10/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# cifar10
|
| 2 |
+
|
| 3 |
+
Use the frozen CIFAR-10 softmax cache when available. If it is absent, regenerate the softmax predictions locally before exporting Y/U arrays.
|
rebuild/cifar10/rebuild_from_torchvision.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Placeholder wrapper generated for the upload bundle.
|
| 2 |
+
# Use the main scripts/ runners in this repository together with the
|
| 3 |
+
# task-specific README in the same directory.
|
rebuild/pbmc/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pbmc
|
| 2 |
+
|
| 3 |
+
Rebuild from PBMC3K, generate pseudobulk mixtures, and then freeze the deconvolution outputs.
|
rebuild/pbmc/generate_pseudobulk.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Placeholder wrapper generated for the upload bundle.
|
| 2 |
+
# Use the main scripts/ runners in this repository together with the
|
| 3 |
+
# task-specific README in the same directory.
|
rebuild/pbmc/rebuild_from_pbmc3k.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Placeholder wrapper generated for the upload bundle.
|
| 2 |
+
# Use the main scripts/ runners in this repository together with the
|
| 3 |
+
# task-specific README in the same directory.
|
rebuild/samson/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# samson
|
| 2 |
+
|
| 3 |
+
Rebuild from the public Samson benchmark bundle and freeze the NMF abundance outputs before conformal evaluation.
|
rebuild/samson/rebuild_from_public_bundle.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Placeholder wrapper generated for the upload bundle.
|
| 2 |
+
# Use the main scripts/ runners in this repository together with the
|
| 3 |
+
# task-specific README in the same directory.
|
rebuild/topics/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# topics
|
| 2 |
+
|
| 3 |
+
Rebuild the topic-mixture task from the public 20 Newsgroups fetcher, then freeze the derived Y/U arrays before running the benchmark.
|
rebuild/topics/rebuild_from_sklearn_fetcher.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Placeholder wrapper generated for the upload bundle.
|
| 2 |
+
# Use the main scripts/ runners in this repository together with the
|
| 3 |
+
# task-specific README in the same directory.
|
rebuild/utkface/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# utkface
|
| 2 |
+
|
| 3 |
+
Rebuild derived age-distribution features from UTKFace locally; do not mirror the raw face-image archive.
|
rebuild/utkface/rebuild_from_utkface.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Placeholder wrapper generated for the upload bundle.
|
| 2 |
+
# Use the main scripts/ runners in this repository together with the
|
| 3 |
+
# task-specific README in the same directory.
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy>=1.24
|
| 2 |
+
scipy>=1.10
|
| 3 |
+
scikit-learn>=1.3
|
| 4 |
+
matplotlib>=3.7
|
| 5 |
+
pyyaml>=6.0
|
| 6 |
+
scanpy
|
| 7 |
+
anndata
|
| 8 |
+
rpy2
|
scripts/build_simplextasks_docs.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build task cards and benchmark docs for the SimplexTasks-12 release."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from textwrap import dedent
|
| 8 |
+
|
| 9 |
+
import yaml
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
REPO_ROOT = Path(__file__).resolve().parents[1]
|
| 13 |
+
RELEASE_ROOT = REPO_ROOT / "release" / "simplextasks-12"
|
| 14 |
+
DOCS_DIR = RELEASE_ROOT / "docs"
|
| 15 |
+
|
| 16 |
+
REAL_EXTRAS = {
|
| 17 |
+
"cifar10_softmax": {
|
| 18 |
+
"evaluation_role": "Extreme classification-style stress test for allocation failure under fixed class-probability vectors.",
|
| 19 |
+
"target_definition": "A 10-way one-hot class distribution paired with a frozen ResNet-18 softmax cache.",
|
| 20 |
+
"default_score": "Total variation / L1 on the simplex.",
|
| 21 |
+
"default_stratification": "Entropy bins of the softmax prediction.",
|
| 22 |
+
"limitations": "This is a classification proxy rather than a naturally continuous composition task. The release ships derived arrays only and does not mirror raw CIFAR-10 images.",
|
| 23 |
+
},
|
| 24 |
+
"topics_20ng": {
|
| 25 |
+
"evaluation_role": "Smooth-heterogeneity real task for topic-mixture prediction.",
|
| 26 |
+
"target_definition": "Fixed 10-topic compositions constructed once from the 20 Newsgroups corpus, with a TF-IDF-to-topic-mixture kNN regressor as the predictor.",
|
| 27 |
+
"default_score": "Aitchison distance.",
|
| 28 |
+
"default_stratification": "Entropy bins of the predicted topic mixture.",
|
| 29 |
+
"limitations": "Both the target and the predictor output are model-derived topic mixtures. The release exposes derived simplex arrays, not the raw text corpus.",
|
| 30 |
+
},
|
| 31 |
+
"samson_unmixing": {
|
| 32 |
+
"evaluation_role": "Natural low-dimensional compositional benchmark aligned with grouped repair.",
|
| 33 |
+
"target_definition": "Three-endmember abundance compositions paired with a frozen NMF abundance estimator.",
|
| 34 |
+
"default_score": "Aitchison distance.",
|
| 35 |
+
"default_stratification": "Boundary bins on the abundance prediction.",
|
| 36 |
+
"limitations": "The benchmark files are public and source-cited, but the upstream bundle did not include an explicit license file; downstream reuse should preserve attribution and the original benchmark context.",
|
| 37 |
+
},
|
| 38 |
+
"pbmc3k_pseudobulk": {
|
| 39 |
+
"evaluation_role": "Semi-synthetic control with known composition targets.",
|
| 40 |
+
"target_definition": "Pseudobulk cell-type fractions generated from PBMC3K together with a fixed NNLS deconvolution predictor.",
|
| 41 |
+
"default_score": "Aitchison distance.",
|
| 42 |
+
"default_stratification": "Boundary bins on the predicted cell-type fractions.",
|
| 43 |
+
"limitations": "This is a semi-synthetic control rather than a deployment-ready deconvolution benchmark. The grouped-repair story is sensitive to the pseudobulk concentration setting, which is documented in the paper appendix.",
|
| 44 |
+
},
|
| 45 |
+
"utkface_age_ldl": {
|
| 46 |
+
"evaluation_role": "Image-based label-distribution task with strong grouped heterogeneity.",
|
| 47 |
+
"target_definition": "A 10-bin age label distribution constructed from UTKFace ages and predicted from thumbnail image features using PCA+kNN regression.",
|
| 48 |
+
"default_score": "Aitchison distance.",
|
| 49 |
+
"default_stratification": "Entropy bins of the predicted age distribution.",
|
| 50 |
+
"limitations": "The underlying face images are not redistributed and remain subject to the source dataset's research-use constraints. The benchmark ships derived features and simplex arrays only.",
|
| 51 |
+
},
|
| 52 |
+
"affectivetext_emotions": {
|
| 53 |
+
"evaluation_role": "Small-sample weak-structure counterexample with a frozen language-model predictor.",
|
| 54 |
+
"target_definition": "Normalized six-emotion gold scores from SemEval-2007 AffectiveText paired with a frozen zero-shot scorer cache.",
|
| 55 |
+
"default_score": "Aitchison distance.",
|
| 56 |
+
"default_stratification": "Boundary bins on the predicted emotion mixture.",
|
| 57 |
+
"limitations": "The reported benchmark tables use a frozen closed-model cache; the release omits raw headlines and raw API responses. A fully open TF-IDF+SVD+kNN fallback cache-builder is included for local reproduction, but it is not identical to the main predictor used in the paper tables.",
|
| 58 |
+
},
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
SYNTH_EXTRAS = {
|
| 62 |
+
"d1_homogeneous": {
|
| 63 |
+
"evaluation_role": "Negative control with no residual-scale heterogeneity.",
|
| 64 |
+
"limitations": "Intended to confirm that the benchmark does not manufacture allocation failures when the DGP is homogeneous.",
|
| 65 |
+
},
|
| 66 |
+
"d2_pure_scale": {
|
| 67 |
+
"evaluation_role": "Canonical smooth-scale heterogeneity regime used to motivate normalization-based repair.",
|
| 68 |
+
"limitations": "This regime isolates scale variation only; it is not intended to cover bias-type misspecification or changing tail shape.",
|
| 69 |
+
},
|
| 70 |
+
"d3_discrete_groups_aligned": {
|
| 71 |
+
"evaluation_role": "Aligned discrete-group regime where group-wise calibration should have an inductive advantage.",
|
| 72 |
+
"limitations": "The grouping is built into the DGP. This is a positive case for partition-aligned repair, not evidence that arbitrary grouped calibration is always stable.",
|
| 73 |
+
},
|
| 74 |
+
"d4_model_bias": {
|
| 75 |
+
"evaluation_role": "Bias-type counterexample where changing the wrapper alone is not enough.",
|
| 76 |
+
"limitations": "This regime is not reducible to a single local scale field, so normalization cannot be expected to fully repair it.",
|
| 77 |
+
},
|
| 78 |
+
"d5_heavy_tail": {
|
| 79 |
+
"evaluation_role": "Smooth-scale regime with heavy tails to stress robustness beyond Gaussian residuals.",
|
| 80 |
+
"limitations": "The regime is still synthetic and targeted; it does not span every possible deviation from the baseline score distribution.",
|
| 81 |
+
},
|
| 82 |
+
"d6_high_k": {
|
| 83 |
+
"evaluation_role": "High-dimensional simplex regime exposing the compute and allocation challenges that appear when K is large.",
|
| 84 |
+
"limitations": "Benchmark-scale exact methods are too expensive here; the paper marks the exact-reference entries separately as D6 dagger cells.",
|
| 85 |
+
},
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def load_json(path: Path) -> dict:
    """Parse the JSON file at *path* and return the decoded object.

    The file is opened as UTF-8 explicitly instead of relying on the
    platform's locale encoding, so metadata containing non-ASCII text is
    read identically on every machine.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def load_yaml(path: Path) -> dict:
    """Parse the YAML file at *path* with ``yaml.safe_load`` and return the result.

    The file is opened as UTF-8 explicitly so that the parsed configuration
    does not depend on the platform's locale encoding.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def write(path: Path, text: str) -> None:
    """Write *text* to *path*, creating missing parent directories first.

    Trailing whitespace is stripped and exactly one newline is appended, so
    every generated file ends with a single terminating newline. The file is
    encoded as UTF-8 explicitly rather than with the locale default, which
    keeps the generated docs byte-stable across platforms.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(text.rstrip() + "\n", encoding="utf-8")
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def bullet_list(items: list[str]) -> str:
    """Format *items* as Markdown bullets, one ``- item`` entry per line.

    An empty input yields an empty string.
    """
    return "\n".join(map("- {}".format, items))
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def synthetic_task_card(task_dir: Path, metadata: dict, config: dict, extra: dict) -> str:
    """Render the Markdown task card for one synthetic regime.

    Args:
        task_dir: Task directory; accepted for signature parity but not read here.
        metadata: Parsed ``metadata.json`` record for the task.
        config: Parsed task config; the ``evaluation``, ``dgp`` and ``data``
            sections are read.
        extra: Hand-written ``evaluation_role`` and ``limitations`` text.

    Returns:
        The card as a single newline-joined string (no trailing newline).

    Raises:
        KeyError: if any field referenced below is missing.
    """
    eval_cfg = config["evaluation"]
    dgp_cfg = config["dgp"]
    data_cfg = config["data"]

    card: list[str] = []

    # Header block with the identifying fields.
    card += [
        f"# {metadata['task_name']} Task Card",
        "",
        f"- Task ID: `{metadata['task_id']}`",
        "- Subset: synthetic",
        f"- Samples: `{metadata['n_samples']}`",
        f"- Simplex dimension: `{metadata['simplex_dim']}`",
        f"- Predictor: {metadata['predictor']}",
        f"- Regime label: {metadata['regime_label']}",
        "",
    ]

    card += [
        "## Evaluation Role",
        "",
        extra["evaluation_role"],
        "",
    ]

    # Data-generating-process settings taken from the task config.
    card += [
        "## DGP Summary",
        "",
        f"- DGP family: `{dgp_cfg['name']}`",
        "- Default score: Aitchison distance",
        f"- Default stratification: `{eval_cfg['strata_method']}` with `{eval_cfg['n_strata']}` strata",
        f"- Calibration size: `{data_cfg['n_cal']}`",
        f"- Test size: `{data_cfg['n_test']}`",
        f"- Repetitions: `{data_cfg['n_rep']}`",
        "",
    ]

    card += [
        "## Release Contents",
        "",
        bullet_list([f"`{array_name}`" for array_name in metadata["available_arrays"]]),
        "",
    ]

    card += [
        "## Provenance And Rebuild",
        "",
        f"- Source asset: {metadata['source_asset']}",
        f"- Config file: `{metadata['config_file']}`",
        f"- Redistribution: `{metadata['redistribution']}`",
        f"- Seed: `{metadata['seed']}`",
        "",
    ]

    card += [
        "## Limitations",
        "",
        extra["limitations"],
    ]
    return "\n".join(card)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def real_task_card(task_dir: Path, metadata: dict, extra: dict) -> str:
    """Render the Markdown task card for one real (fixed-predictor) task.

    Args:
        task_dir: Task directory; accepted for signature parity but not read here.
        metadata: Parsed ``metadata.json`` record for the task.
        extra: Hand-written ``evaluation_role``, ``target_definition``,
            ``default_score``, ``default_stratification`` and ``limitations`` text.

    Returns:
        The card as a single newline-joined string (no trailing newline).

    Raises:
        KeyError: if any field referenced below is missing.
    """
    card: list[str] = []

    # Header block with the identifying fields.
    card += [
        f"# {metadata['task_name']} Task Card",
        "",
        f"- Task ID: `{metadata['task_id']}`",
        "- Subset: real",
        f"- Samples: `{metadata['n_samples']}`",
        f"- Simplex dimension: `{metadata['simplex_dim']}`",
        f"- Predictor: {metadata['predictor']}",
        "",
    ]

    card += [
        "## Evaluation Role",
        "",
        extra["evaluation_role"],
        "",
    ]

    card += [
        "## Target And Predictor",
        "",
        extra["target_definition"],
        "",
    ]

    card += [
        "## Default Benchmark Settings",
        "",
        f"- Default score: {extra['default_score']}",
        f"- Default stratification: {extra['default_stratification']}",
        f"- Redistribution: `{metadata['redistribution']}`",
        "",
    ]

    card += [
        "## Release Contents",
        "",
        bullet_list([f"`{array_name}`" for array_name in metadata["available_arrays"]]),
        "",
    ]

    card += [
        "## Provenance And Usage Notes",
        "",
        f"- Source asset: {metadata['source_asset']}",
        f"- Metadata note: {metadata['notes']}",
        "",
    ]

    card += [
        "## Limitations",
        "",
        extra["limitations"],
    ]
    return "\n".join(card)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def build_task_cards() -> None:
    """Write a ``task_card.md`` into every task directory of the release.

    Each sub-directory of ``RELEASE_ROOT / "real"`` and
    ``RELEASE_ROOT / "synthetic"`` is visited in sorted order. Its
    ``metadata.json`` (and, for synthetic tasks, ``config.yaml``) is loaded
    and combined with the hand-written extras keyed by ``task_id``.

    Plain files sitting next to the task directories (a README, editor or OS
    droppings) are skipped; previously they were treated as task directories
    and the metadata lookup failed.

    Raises:
        KeyError: if a task's ``task_id`` has no entry in the extras table.
        OSError: if a task directory lacks the expected metadata/config file.
    """
    for task_dir in sorted((RELEASE_ROOT / "real").glob("*")):
        if not task_dir.is_dir():
            continue  # glob("*") also matches regular files
        metadata = load_json(task_dir / "metadata.json")
        extra = REAL_EXTRAS[metadata["task_id"]]
        write(task_dir / "task_card.md", real_task_card(task_dir, metadata, extra))

    for task_dir in sorted((RELEASE_ROOT / "synthetic").glob("*")):
        if not task_dir.is_dir():
            continue  # glob("*") also matches regular files
        metadata = load_json(task_dir / "metadata.json")
        config = load_yaml(task_dir / "config.yaml")
        extra = SYNTH_EXTRAS[metadata["task_id"]]
        write(task_dir / "task_card.md", synthetic_task_card(task_dir, metadata, config, extra))
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def build_docs() -> None:
    """Write the release-level benchmark card and evaluation protocol.

    Both documents are static Markdown embedded below. Each is dedented and
    stripped, then written under ``DOCS_DIR`` as ``benchmark_card.md`` and
    ``evaluation_protocol.md`` respectively.
    """
    documents = {
        "benchmark_card.md": dedent(
            """
            # SimplexTasks-12 Benchmark Card

            SimplexTasks-12 is the processed task collection behind the SimplexUQ benchmark. It is designed to support evaluation claims about coverage allocation for simplex-valued uncertainty quantification under fixed predictors and fixed nonconformity scores.

            ## Supported Claims

            - Whether a fixed simplex-valued predictor exhibits allocation failure beyond its marginal coverage.
            - Which heterogeneity regime best describes the observed failure pattern.
            - Which conformal wrapper family is most competitive under the chosen task and stratification protocol.

            ## Claims The Benchmark Does Not Support

            - Universal wrapper rankings across all simplex tasks.
            - Deployment-readiness claims for any predictor.
            - Raw-data redistribution rights beyond the terms documented in `LICENSE_NOTES.md`.

            ## Benchmark Contents

            - 6 synthetic regimes with canonical `task.npz` bundles and copied YAML configs.
            - 6 fixed-predictor real tasks with cached `(Y, U)` arrays or derived features.
            - Per-task `task_card.md` files and `metadata.json` provenance records.
            - Release-level rebuild instructions for the paper tables and figures.

            ## Reproducibility Contract

            - Benchmark evaluation always operates on frozen predictor outputs.
            - Default stratification rules are fixed before wrapper comparison.
            - Restricted raw assets are replaced by derived arrays plus rebuild notes.
            - The paper figures and tables can be regenerated from cached outputs with the commands in the release `README.md`.

            ## Responsible Use

            Some tasks rely on non-redistributable source assets or derived features from sensitive domains such as face images. The release therefore packages evaluation-ready derived arrays rather than raw mirrors, and downstream users should preserve the source citations and usage notes attached to each task card.
            """
        ).strip(),
        "evaluation_protocol.md": dedent(
            """
            # SimplexTasks-12 Evaluation Protocol

            ## Fixed-Predictor Principle

            Every benchmark slice fixes the predictor first and then varies only the conformal wrapper. This keeps the evaluation target on uncertainty allocation rather than on predictor training.

            ## Default Scores

            - Real and synthetic composition tasks use Aitchison distance unless the target lies on simplex vertices.
            - CIFAR-10 uses total variation / L1 because Aitchison distance is ill-defined at the one-hot boundary.

            ## Stratification Rules

            - Default strata are entropy bins, boundary bins, or task-specific dominant-group partitions.
            - Sensitivity sweeps use fixed alternative stratifications defined from cached prediction vectors only.
            - Stratification maps are not tuned per wrapper and do not depend on calibration/test responses.

            ## Main Metrics

            - Marginal coverage.
            - Max disparity across prediction-space strata.
            - Worst-stratum coverage.
            - Coverage variance.
            - SSCV and mean radius; low-dimensional synthetic tasks also report a simplex-volume ratio.

            ## Wrapper Families

            - Global split conformal.
            - Group-wise / Mondrian conformal.
            - Two-stage normalization.
            - Exact or leave-one-out references where affordable.
            - Diagnostic variants such as OneShot, TrainRes, and the current weighted implementation.

            ## Output Interpretation

            The benchmark is designed to compare wrapper families under visible allocation-efficiency-compute tradeoffs. It should not be reduced to a single leaderboard or read as a conditional-coverage certification protocol.
            """
        ).strip(),
    }

    # Dict order is insertion order: benchmark card first, protocol second.
    for filename, body in documents.items():
        write(DOCS_DIR / filename, body)
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def main() -> None:
    """Build the per-task cards, then the release-level docs, and report completion."""
    for build_step in (build_task_cards, build_docs):
        build_step()
    print("Built SimplexTasks-12 task cards and docs.")
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
if __name__ == "__main__":
|
| 297 |
+
main()
|
scripts/cache_affective_text_open_predictions.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build a fully open AffectiveText prediction cache with out-of-fold regressors."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
from sklearn.decomposition import TruncatedSVD
|
| 11 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 12 |
+
from sklearn.model_selection import KFold
|
| 13 |
+
from sklearn.neighbors import KNeighborsRegressor
|
| 14 |
+
from sklearn.preprocessing import Normalizer
|
| 15 |
+
|
| 16 |
+
import sys
|
| 17 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
| 18 |
+
|
| 19 |
+
from src.data import EMOTION_NAMES, load_affective_text
|
| 20 |
+
|
| 21 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
| 22 |
+
log = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def macro_pearson(a: np.ndarray, b: np.ndarray) -> float:
    """Average the column-wise Pearson correlation between *a* and *b*.

    Columns where either input has a standard deviation of at most 1e-12 are
    left out of the average. If every column is left out, NaN is returned.
    """
    correlations = []
    n_cols = a.shape[1]
    for col in range(n_cols):
        left, right = a[:, col], b[:, col]
        degenerate = np.std(left) <= 1e-12 or np.std(right) <= 1e-12
        if not degenerate:
            correlations.append(float(np.corrcoef(left, right)[0, 1]))
    if not correlations:
        return float("nan")
    return float(np.mean(correlations))
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def fit_predict_fold(
    train_texts: list[str],
    test_texts: list[str],
    train_targets: np.ndarray,
    n_components: int,
    n_neighbors: int,
) -> np.ndarray:
    """Fit the TF-IDF -> SVD -> kNN pipeline on one fold and predict the held-out texts.

    The vectorizer, the SVD projection and the regressor are all fitted on
    the training texts only; the test texts are merely transformed.

    Args:
        train_texts: Texts used for fitting.
        test_texts: Texts to predict.
        train_targets: Regression targets aligned with ``train_texts``.
        n_components: Requested SVD rank; capped by the size of the training matrix.
        n_neighbors: Requested neighbour count; capped by the number of training texts.

    Returns:
        Float array with the kNN predictions for ``test_texts``.
    """
    tfidf = TfidfVectorizer(
        lowercase=True,
        strip_accents="unicode",
        sublinear_tf=True,
        ngram_range=(1, 2),
        min_df=1,
        max_df=0.95,
        stop_words="english",
    )
    train_features = tfidf.fit_transform(train_texts)
    test_features = tfidf.transform(test_texts)

    n_docs, n_terms = train_features.shape[0], train_features.shape[1]
    rank_cap = min(n_docs - 1, n_terms - 1)
    if rank_cap < 2:
        # Too little data for a useful projection: use the dense TF-IDF rows directly.
        train_features = train_features.toarray()
        test_features = test_features.toarray()
    else:
        reducer = TruncatedSVD(n_components=min(n_components, rank_cap), random_state=0)
        row_scaler = Normalizer(copy=False)
        train_features = row_scaler.fit_transform(reducer.fit_transform(train_features))
        test_features = row_scaler.transform(reducer.transform(test_features))

    regressor = KNeighborsRegressor(
        n_neighbors=min(n_neighbors, len(train_texts)),
        weights="distance",
        metric="minkowski",
        p=2,
    )
    regressor.fit(train_features, train_targets)
    predictions = regressor.predict(test_features)
    return np.asarray(predictions, dtype=float)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def build_open_predictions(
    headlines: list[str],
    raw_scores: np.ndarray,
    n_splits: int,
    n_components: int,
    n_neighbors: int,
    seed: int,
) -> tuple[np.ndarray, np.ndarray]:
    """Produce out-of-fold predictions for every headline via shuffled K-fold.

    For each fold the pipeline in ``fit_predict_fold`` is fitted on the
    training part and applied to the held-out part. Predictions are clipped
    at zero. A held-out row whose clipped prediction sums to (almost) zero is
    replaced by the mean target of that fold's *training* rows.

    The fallback deliberately uses the training-fold mean rather than the
    mean over all rows: a global mean includes the held-out labels and would
    leak them into predictions that are meant to be out-of-fold.

    Args:
        headlines: Input texts.
        raw_scores: Target matrix aligned row-wise with ``headlines``.
        n_splits: Number of folds.
        n_components: SVD rank forwarded to ``fit_predict_fold``.
        n_neighbors: Neighbour count forwarded to ``fit_predict_fold``.
        seed: Random state for the fold shuffle.

    Returns:
        ``(preds, folds)``: the float prediction matrix shaped like
        ``raw_scores`` and, per row, the index of the fold in which that row
        was held out.
    """
    n = len(headlines)
    preds = np.zeros_like(raw_scores, dtype=float)
    folds = np.full(n, -1, dtype=int)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold_id, (train_idx, test_idx) in enumerate(splitter.split(headlines)):
        train_texts = [headlines[i] for i in train_idx]
        test_texts = [headlines[i] for i in test_idx]
        train_targets = raw_scores[train_idx]
        fold_preds = fit_predict_fold(
            train_texts=train_texts,
            test_texts=test_texts,
            train_targets=train_targets,
            n_components=n_components,
            n_neighbors=n_neighbors,
        )
        fold_preds = np.clip(fold_preds, 0.0, None)
        zero_rows = fold_preds.sum(axis=1) <= 1e-12
        if np.any(zero_rows):
            # Training rows only, so no held-out label influences the fallback.
            fold_preds[zero_rows] = train_targets.mean(axis=0)
        preds[test_idx] = fold_preds
        folds[test_idx] = fold_id
        log.info("Finished fold %d/%d", fold_id + 1, n_splits)

    return preds, folds
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def main() -> None:
    """Command-line entry point: build and cache the open fallback predictions.

    Parses the options, refuses to clobber an existing output unless
    ``--overwrite`` is given, loads the dataset, computes out-of-fold
    predictions, logs two Pearson summaries and writes one JSON object per
    headline to the output file.

    Raises:
        FileExistsError: if the output exists and ``--overwrite`` was not passed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-dir", default="data/raw/AffectiveText.Semeval.2007")
    parser.add_argument("--output", default="data/processed/affective_text_open_oof_predictions.jsonl")
    integer_options = (
        ("--n-splits", 5),
        ("--n-components", 128),
        ("--n-neighbors", 25),
        ("--seed", 2026),
        ("--limit", None),
    )
    for flag, default in integer_options:
        parser.add_argument(flag, type=int, default=default)
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    output_path = Path(args.output)
    if not args.overwrite and output_path.exists():
        raise FileExistsError(f"Output already exists: {output_path}")

    data = load_affective_text(args.data_dir)
    ids, headlines = data["ids"], data["headlines"]
    raw_scores = np.asarray(data["raw_scores"], dtype=float)
    if args.limit is not None:
        head = slice(None, args.limit)
        ids, headlines, raw_scores = ids[head], headlines[head], raw_scores[head]

    pred_scores, folds = build_open_predictions(
        headlines=headlines,
        raw_scores=raw_scores,
        n_splits=args.n_splits,
        n_components=args.n_components,
        n_neighbors=args.n_neighbors,
        seed=args.seed,
    )

    macro_r = macro_pearson(raw_scores, pred_scores)
    flat_truth = raw_scores.reshape(-1)
    flat_pred = pred_scores.reshape(-1)
    flat_r = float(np.corrcoef(flat_truth, flat_pred)[0, 1])
    log.info(
        "Open fallback predictor quality: macro Pearson=%.3f, flattened Pearson=%.3f",
        macro_r,
        flat_r,
    )

    # The builder settings are identical for every row, so assemble them once.
    builder_info = {
        "n_splits": int(args.n_splits),
        "n_components": int(args.n_components),
        "n_neighbors": int(args.n_neighbors),
        "seed": int(args.seed),
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for idx, headline, scores, fold_id in zip(ids, headlines, pred_scores, folds):
            record = {
                "id": idx,
                "headline": headline,
                "emotions": EMOTION_NAMES,
                "scores": [float(value) for value in scores],
                "provider": "open_fallback",
                "model": "tfidf_svd_knn_oof",
                "fold": int(fold_id),
                "builder": builder_info,
                "notes": "Deterministic out-of-fold TF-IDF+SVD+kNN regression fallback.",
            }
            f.write(json.dumps(record, ensure_ascii=True) + "\n")

    log.info("Finished. Predictions cached at %s", output_path)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
if __name__ == "__main__":
|
| 179 |
+
main()
|
scripts/cache_affective_text_predictions.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Cache zero-shot API emotion scores for SemEval-2007 Affective Text."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import time
|
| 10 |
+
import urllib.error
|
| 11 |
+
import urllib.parse
|
| 12 |
+
import urllib.request
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
import sys
|
| 16 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
| 17 |
+
|
| 18 |
+
from src.data import EMOTION_NAMES, load_affective_text, load_prediction_cache
|
| 19 |
+
|
| 20 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
| 21 |
+
log = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
PROMPT_TEMPLATE = (
|
| 24 |
+
'Rate the following news headline on 6 emotions: anger, disgust, fear, joy, sadness, surprise. '
|
| 25 |
+
'Return only 6 numbers from 0 to 100, comma-separated, in that order.\n'
|
| 26 |
+
'Headline: "{headline}"\n'
|
| 27 |
+
"Scores:"
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def parse_scores(text: str) -> list[float]:
    """Extract the first six numbers from a model response.

    Negative values are clamped to zero. Any numbers after the sixth are
    ignored.

    Raises:
        ValueError: if fewer than six numbers are found, or if the six
            clamped scores sum to zero.
    """
    matches = re.findall(r"-?\d+(?:\.\d+)?", text)
    if len(matches) < 6:
        raise ValueError(f"Could not parse 6 scores from response: {text!r}")

    scores = [max(value, 0.0) for value in map(float, matches[:6])]
    if sum(scores) <= 0:
        raise ValueError(f"Parsed zero-sum scores from response: {text!r}")
    return scores
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def call_openai_chat_completions(
    headline: str,
    model: str,
    api_key: str,
    base_url: str,
    timeout_sec: float,
) -> tuple[str, dict]:
    """POST one scoring prompt to ``<base_url>/chat/completions``.

    The API key is sent as a Bearer token in the ``Authorization`` header and
    the request uses temperature 0.

    Returns:
        ``(text, body)``: the message content of the first choice and the
        full decoded JSON response.
    """
    user_message = PROMPT_TEMPLATE.format(headline=headline)
    request_json = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a precise annotation model."},
            {"role": "user", "content": user_message},
        ],
        "temperature": 0,
    }
    endpoint = base_url.rstrip("/") + "/chat/completions"
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    request = urllib.request.Request(
        url=endpoint,
        data=json.dumps(request_json).encode("utf-8"),
        headers=request_headers,
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout_sec) as resp:
        raw_response = resp.read()
    body = json.loads(raw_response.decode("utf-8"))
    first_choice = body["choices"][0]
    return first_choice["message"]["content"], body
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def call_gemini_generate_content(
    headline: str,
    model: str,
    api_key: str,
    base_url: str,
    timeout_sec: float,
) -> tuple[str, dict]:
    """POST one scoring prompt to ``<base_url>/models/<model>:generateContent``.

    The API key is passed URL-quoted as the ``key`` query parameter and the
    request uses temperature 0.

    Returns:
        ``(text, body)``: the non-empty text parts of the first candidate
        joined with newlines, and the full decoded JSON response.

    Raises:
        KeyError: if the response has no candidates, or the first candidate
            carries no text.
    """
    user_prompt = PROMPT_TEMPLATE.format(headline=headline)
    request_json = {
        "contents": [
            {
                "role": "user",
                "parts": [{"text": user_prompt}],
            }
        ],
        "generationConfig": {
            "temperature": 0,
        },
    }
    quoted_key = urllib.parse.quote(api_key)
    endpoint = base_url.rstrip("/") + f"/models/{model}:generateContent?key={quoted_key}"
    request = urllib.request.Request(
        url=endpoint,
        data=json.dumps(request_json).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout_sec) as resp:
        raw_response = resp.read()
    body = json.loads(raw_response.decode("utf-8"))

    candidates = body.get("candidates", [])
    if not candidates:
        raise KeyError(f"No Gemini candidates in response: {body}")

    parts = candidates[0].get("content", {}).get("parts", [])
    fragments = [part.get("text", "") for part in parts if part.get("text")]
    text = "\n".join(fragments)
    if not text:
        raise KeyError(f"No text parts in Gemini response: {body}")
    return text, body
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def main():
    """Cache LLM emotion-score predictions for headlines as a JSONL file.

    Command-line flags select the provider (``gemini`` or ``openai``), model,
    base URL and the environment variable holding the API key; unset flags
    fall back to provider-specific environment variables and then to
    built-in defaults. Headlines whose id is already present in the output
    file are skipped unless ``--overwrite`` is given, so an interrupted run
    can be resumed. Each successful prediction is written and flushed
    immediately as one JSON object per line.

    A failure on a single headline (network error, timeout, truncated or
    malformed response, unparseable scores) is logged and counted, and the
    run continues with the next headline.

    Raises:
        EnvironmentError: If the API-key environment variable is unset or
            empty.
    """
    # Local import: the module-level import block is not guaranteed to
    # provide http.client, and this is the script's entry point.
    import http.client

    parser = argparse.ArgumentParser()
    parser.add_argument("--data-dir", default="data/raw/AffectiveText.Semeval.2007")
    parser.add_argument("--output", default="data/processed/affective_text_predictions.jsonl")
    parser.add_argument("--provider", choices=["openai", "gemini"], default="gemini")
    parser.add_argument("--model", default=None)
    parser.add_argument("--base-url", default=None)
    parser.add_argument("--api-key-env", default=None)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--sleep-sec", type=float, default=0.0)
    parser.add_argument("--timeout-sec", type=float, default=60.0)
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    # Resolve provider-specific defaults: explicit flag > environment > built-in.
    is_gemini = args.provider == "gemini"
    if args.model is None:
        if is_gemini:
            args.model = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001")
        else:
            args.model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini-2024-07-18")
    if args.base_url is None:
        if is_gemini:
            args.base_url = os.environ.get("GEMINI_BASE_URL", "https://generativelanguage.googleapis.com/v1beta")
        else:
            args.base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
    if args.api_key_env is None:
        args.api_key_env = "GEMINI_API_KEY" if is_gemini else "OPENAI_API_KEY"

    api_key = os.environ.get(args.api_key_env)
    if not api_key:
        raise EnvironmentError(f"Missing API key in env var {args.api_key_env}")

    data = load_affective_text(args.data_dir)
    ids = data["ids"]
    headlines = data["headlines"]
    if args.limit is not None:
        ids = ids[:args.limit]
        headlines = headlines[:args.limit]

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # `existing` is only ever populated when we are NOT overwriting, so it
    # alone decides both the skip check and the append-vs-truncate mode below.
    existing = {}
    if out_path.exists() and not args.overwrite:
        existing = load_prediction_cache(out_path)
        log.info(f"Loaded {len(existing)} cached predictions from {out_path}")

    # Both helpers are called with the same keyword arguments.
    request_prediction = (
        call_gemini_generate_content if is_gemini else call_openai_chat_completions
    )

    # Per-item failures that should be logged and skipped, not abort the run.
    recoverable_errors = (
        # Covers URLError/HTTPError and also timeouts or connection resets
        # raised while the response body is being read, which urllib does
        # not wrap in URLError.
        OSError,
        # Protocol-level failures such as a truncated body (IncompleteRead).
        http.client.HTTPException,
        # Includes JSON and UTF-8 decoding errors.
        ValueError,
        KeyError,
        # For example an empty "choices" list in a chat-completions response.
        IndexError,
    )

    n_done = 0
    n_failed = 0
    with open(out_path, "a" if existing else "w", encoding="utf-8") as f:
        for idx, headline in zip(ids, headlines):
            if idx in existing:
                continue
            try:
                raw_text, raw_json = request_prediction(
                    headline=headline,
                    model=args.model,
                    api_key=api_key,
                    base_url=args.base_url,
                    timeout_sec=args.timeout_sec,
                )
                scores = parse_scores(raw_text)
            except recoverable_errors as exc:
                log.error(f"Failed on id={idx}: {exc}")
                n_failed += 1
            else:
                row = {
                    "id": idx,
                    "headline": headline,
                    "emotions": EMOTION_NAMES,
                    "scores": scores,
                    "provider": args.provider,
                    "model": args.model,
                    "base_url": args.base_url,
                    "prompt_template": PROMPT_TEMPLATE,
                    "raw_text": raw_text,
                    "raw_response": raw_json,
                }
                f.write(json.dumps(row, ensure_ascii=True) + "\n")
                # Flush per row so an interrupted run loses at most one item.
                f.flush()
                n_done += 1
                if n_done % 50 == 0:
                    log.info(f"Cached {n_done} new predictions")
            # Pause after failures as well as successes, so a rate-limited
            # API is not hit again immediately.
            if args.sleep_sec > 0:
                time.sleep(args.sleep_sec)

    log.info(f"Cached {n_done} new predictions; {n_failed} failed")
    log.info(f"Finished. Predictions cached at {out_path}")
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
# Run the caching CLI only when this file is executed as a script;
# importing the module must not trigger any API calls.
if __name__ == "__main__":
    main()
|