"""
ablation_configs.py
====================
The ablation matrix for the three-band SVAE validation sweep.

Each config is a dict of overrides on the baseline PatchSVAE_F trainer.
The trainer expects:
  - band: 'LOW' | 'MID' | 'HIGH' (selects the base architecture)
  - variant: unique identifier for this variant within the group
  - seed: random seed
  - phase: 1 (1000-batch triage) | 2 (30-epoch full)
  - overrides: dict of RunConfig field overrides

Three band representatives (kept constant across every test):
  LOW:  S=64, V=64, D=16, h=64, d=1, patch=16, 184K params, CV target ~ 0.21
  MID:  S=64, V=64, D=8,  h=64, d=1, patch=16, 183K params, CV target ~ 0.39
  HIGH: S=64, V=32, D=4,  h=64, d=1, patch=4,   41K params, CV target ~ 1.10

Phase 1 early-stop:
  - LOW/MID bands: train to batch 1000, record CV_ema, classify band
  - HIGH band: train to batch 100, record CV_ema, classify band

Phase 2 full run (original design):
  - Group E (soft-hand): 30 epochs, 10 seeds per variant
  - Group H (SVD necessity): 30 epochs, 3 seeds per variant
  NOTE: the Phase 2 group builders in this file currently emit
  num_epochs=1 with at most 3 seeds per variant; see the
  get_phase2_configs docstring for the revised matrix.
"""
from typing import Dict, List, Any
from dataclasses import dataclass, field, asdict
# ----------------------------------------------------------------------------
# Band representatives -- the three anchor configs
# ----------------------------------------------------------------------------
# Anchor architecture for each CV band, keyed by band name.
# 'expected_cv' and 'expected_params' are the reference values each band
# is expected to reproduce (same figures as the module docstring table).
BAND_REPS = {
    'LOW': dict(
        img_size=64,
        V=64,
        D=16,
        hidden=64,
        depth=1,
        patch_size=16,
        n_cross=1,
        expected_cv=0.21,
        expected_params=184_000,
    ),
    'MID': dict(
        img_size=64,
        V=64,
        D=8,
        hidden=64,
        depth=1,
        patch_size=16,
        n_cross=1,
        expected_cv=0.39,
        expected_params=183_000,
    ),
    'HIGH': dict(
        img_size=64,
        V=32,
        D=4,
        hidden=64,
        depth=1,
        patch_size=4,
        n_cross=1,
        expected_cv=1.10,
        expected_params=41_000,
    ),
}
def band_classifier(cv_ema: float) -> str:
    """Map a final CV-EMA value onto a band label.

    Thresholds:
      - cv_ema < 0.30          -> 'LOW'
      - 0.30 <= cv_ema < 0.55  -> 'MID'
      - cv_ema > 0.80          -> 'HIGH'
      - anything else          -> 'UNCLASSIFIED'

    The interval [0.55, 0.80] is deliberately left unassigned, and a NaN
    input also yields 'UNCLASSIFIED' because every comparison is False.
    """
    if cv_ema < 0.30:
        return 'LOW'
    if cv_ema < 0.55:
        return 'MID'
    return 'HIGH' if cv_ema > 0.80 else 'UNCLASSIFIED'
def phase1_batch_limit(band: str) -> int:
    """Return the Phase 1 training budget (in batches) for a band.

    The 'HIGH' band stops at 100 batches; every other band value
    (including 'LOW' and 'MID') stops at 1000.
    """
    return 100 if band == 'HIGH' else 1000
def phase2_batch_limit(config: Dict[str, Any]) -> int:
    """Return how many batches per epoch a Phase 2 config should run.

    Resolution order:
      1. An explicit 'batch_limit' key on the config wins outright. This
         lets the floor sweep (P group) cap at a few dozen batches
         without changing defaults for the other Phase 2 configs.
      2. If config['overrides']['optimizer'] is 'lbfgs', the limit is
         capped at 2000 outer batches. Per the original design notes,
         LBFGS runs ~20 inner iterations per outer step, so 2000 outer
         batches is roughly 40k gradient steps, which is considered
         plenty for within-attractor convergence and bounds wallclock.
      3. Otherwise the limit is 1_000_000 // batch_size, with batch_size
         read from the top-level config and defaulting to 256
         (1_000_000 // 256 = 3906, i.e. ~3900 batches).
    """
    has_explicit_limit = 'batch_limit' in config
    if has_explicit_limit:
        return config['batch_limit']

    uses_lbfgs = config.get('overrides', {}).get('optimizer') == 'lbfgs'
    if uses_lbfgs:
        return 2000  # cap for LBFGS wallclock

    return 1_000_000 // config.get('batch_size', 256)
# ----------------------------------------------------------------------------
# Ablation group definitions
# ----------------------------------------------------------------------------
def group_A_seed_replication() -> List[Dict[str, Any]]:
    """Build the reproducibility group: 5 seeds x 3 bands = 15 runs.

    Tests whether each band reproducibly appears across random inits.
    Acceptance: >=4/5 seeds per band within +/-0.02 of expected CV.

    Returns:
        Phase 1 configs ordered band-major (LOW, MID, HIGH), seeds 0-4
        within each band. No overrides are applied; only the seed varies.
    """
    return [
        {
            'group': 'A',
            'variant': 'baseline',
            'band': band_name,
            'seed': seed_idx,
            'phase': 1,
            'overrides': {},  # baseline run: seed variation only
            'description': f'A-{band_name}-baseline-s{seed_idx}',
        }
        for band_name in ('LOW', 'MID', 'HIGH')
        for seed_idx in range(5)
    ]
def group_B_dataset_composition() -> List[Dict[str, Any]]:
    """Build the noise-type dependence group: 6 variants x 3 bands = 18 runs.

    Tests whether band structure is architecture-driven or data-driven by
    restricting training to different subsets of the noise-type indices.

    Returns:
        Phase 1 configs ordered variant-major, then LOW/MID/HIGH. Each
        config gets its own copy of the 'noise_types' list, so mutating
        one config's overrides cannot leak into its sibling bands.
    """
    variants = {
        'B1_all16': list(range(16)),
        'B2_gaussian_only': [0],
        # Per original note: block, gradient, checker, mixed, structural.
        'B3_structured': [3, 4, 5, 11, 13],
        # Per original note: cauchy, laplace, exponential (check indices).
        'B4_heavy_tailed': [6, 7, 10],
        'B5_first_half': list(range(8)),
        'B6_even_indices': [0, 2, 4, 6, 8, 10, 12, 14],
    }
    configs = []
    for variant_name, types in variants.items():
        for band in ['LOW', 'MID', 'HIGH']:
            configs.append({
                'group': 'B',
                'variant': variant_name,
                'band': band,
                'seed': 0,
                'phase': 1,
                # Fresh list per config: the three bands of a variant must
                # not share one mutable list object.
                'overrides': {'noise_types': list(types)},
                'description': f'B-{band}-{variant_name}',
            })
    return configs
def group_C_optimizer() -> List[Dict[str, Any]]:
    """Build the optimizer-dependence group: 4 variants x 3 bands = 12 runs.

    Tests whether the attractor is Adam-specific.

    NOTE: LBFGS was originally included as C5 but removed 2026-04-20
    after empirical evidence that it is incompatible with the sphere-
    normed architecture as currently constructed. LBFGS's flat-space
    strong Wolfe line search drives parameters away from the sphere
    manifold during line search, producing ill-conditioned SVD inputs.
    Symptoms observed: D=16 crashed in torch.linalg.eigh with "failed
    to converge - ill-conditioned or too many repeated eigenvalues";
    D=8 and D=4 completed but produced NaN MSE (CV measurements at
    intermediate batches were valid - 0.3373 MID, 0.9435 HIGH - but
    final test MSE was NaN, indicating parameters went non-finite
    during training).

    This is NOT a finding about LBFGS as an optimizer - it's a finding
    about the LBFGS-sphere_norm interaction. A proper test requires
    Riemannian LBFGS with constraint-aware line search. See scratchpad
    entry 000080 for the dedicated LBFGS engineering pass TODO.

    Returns:
        Phase 1 configs ordered variant-major, then LOW/MID/HIGH. Every
        config owns an independent copy of its overrides dict.
    """
    variants = [
        ('C1_adam', {'optimizer': 'adam', 'lr': 1e-4, 'weight_decay': 0.0}),
        ('C2_sgd', {'optimizer': 'sgd', 'lr': 1e-2, 'momentum': 0.0}),
        ('C3_sgd_momentum', {'optimizer': 'sgd', 'lr': 1e-2, 'momentum': 0.9}),
        ('C4_adamw', {'optimizer': 'adamw', 'lr': 1e-4, 'weight_decay': 0.01}),
    ]
    configs = []
    for variant_name, overrides in variants:
        for band in ['LOW', 'MID', 'HIGH']:
            configs.append({
                'group': 'C',
                'variant': variant_name,
                'band': band,
                'seed': 0,
                'phase': 1,
                # Copy so the three bands do not alias one mutable dict.
                'overrides': dict(overrides),
                'description': f'C-{band}-{variant_name}',
            })
    return configs
def group_D_schedule() -> List[Dict[str, Any]]:
    """Build the LR-schedule group: 5 variants x 3 bands = 15 runs.

    Returns:
        Phase 1 configs ordered variant-major, then LOW/MID/HIGH. Every
        config owns an independent copy of its overrides dict.
    """
    variants = [
        ('D1_cosine', {'scheduler': 'cosine'}),
        ('D2_constant', {'scheduler': 'constant'}),
        ('D3_linear_decay', {'scheduler': 'linear'}),
        ('D4_warm_restart', {'scheduler': 'cosine_warm_restarts', 'T_0': 1000}),
        ('D5_one_cycle', {'scheduler': 'one_cycle'}),
    ]
    configs = []
    for variant_name, overrides in variants:
        for band in ['LOW', 'MID', 'HIGH']:
            configs.append({
                'group': 'D',
                'variant': variant_name,
                'band': band,
                'seed': 0,
                'phase': 1,
                # Copy so the three bands do not alias one mutable dict.
                'overrides': dict(overrides),
                'description': f'D-{band}-{variant_name}',
            })
    return configs
def group_E_soft_hand() -> List[Dict[str, Any]]:
    """Build the soft-hand guidance group for PHASE 2.

    Each run is 1 epoch (~3900 batches at batch_size=256).

    Phase 1 E_preview already showed all four variants reach the same
    band at 1000 batches (all within 0.0014 CV). The Phase 2 question is
    NO LONGER "does the attractor survive" - that's settled - but rather
    what the within-attractor reconstruction MSE is under each soft-hand
    regime over a full epoch.

    Primary comparison: E1 (full soft-hand) vs E2 (pure MSE). If MSE
    differs meaningfully, soft-hand is trading reconstruction quality
    for geometric coherence at an epoch-scale budget.

    4 variants x 3 bands x 3 seeds = 36 runs.

    Returns:
        Phase 2 configs ordered variant-major, then band, then seed.
        Every config owns an independent copy of its overrides dict.
    """
    variants = [
        ('E1_full_softhand', {'soft_hand': True, 'boost': 0.5, 'cv_penalty': 0.3}),
        ('E2_pure_mse', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0}),
        ('E3_measure_only', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0, 'cv_measurement_only': True}),
        # NOTE(review): hard_cv_target is 0.21 (the LOW expected CV) for
        # every band, including MID and HIGH - confirm this is intended.
        ('E4_hard_cv_penalty', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 1.0, 'hard_cv_target': 0.21}),
    ]
    configs = []
    for variant_name, overrides in variants:
        for band in ['LOW', 'MID', 'HIGH']:
            for seed in range(3):
                configs.append({
                    'group': 'E',
                    'variant': variant_name,
                    'band': band,
                    'seed': seed,
                    'phase': 2,
                    'num_epochs': 1,
                    'batch_size': 256,
                    # Copy so bands/seeds do not alias one mutable dict.
                    'overrides': dict(overrides),
                    'description': f'E-{band}-{variant_name}-s{seed}',
                })
    return configs
def group_E_subset_phase1() -> List[Dict[str, Any]]:
    """Build the Phase 1 preview of group E: 1 seed per variant, 1000 batches.

    Quick read on whether E2 even approaches the attractor before
    committing to full Phase 2 Group E. 4 variants x 3 bands = 12 runs.

    Returns:
        Phase 1 configs tagged group 'E_preview', ordered variant-major,
        then LOW/MID/HIGH. Every config owns an independent copy of its
        overrides dict.
    """
    variants = [
        ('E1_full_softhand', {'soft_hand': True, 'boost': 0.5, 'cv_penalty': 0.3}),
        ('E2_pure_mse', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0}),
        ('E3_measure_only', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0, 'cv_measurement_only': True}),
        ('E4_hard_cv_penalty', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 1.0, 'hard_cv_target': 0.21}),
    ]
    configs = []
    for variant_name, overrides in variants:
        for band in ['LOW', 'MID', 'HIGH']:
            configs.append({
                'group': 'E_preview',
                'variant': variant_name,
                'band': band,
                'seed': 0,
                'phase': 1,
                # Copy so the three bands do not alias one mutable dict.
                'overrides': dict(overrides),
                'description': f'Eprev-{band}-{variant_name}',
            })
    return configs
def group_F_activation() -> List[Dict[str, Any]]:
    """Build the activation-function group: 5 variants x 3 bands = 15 runs.

    Returns:
        Phase 1 configs ordered variant-major, then LOW/MID/HIGH. Every
        config owns an independent copy of its overrides dict.
    """
    variants = [
        ('F1_gelu', {'activation': 'gelu'}),
        ('F2_relu', {'activation': 'relu'}),
        ('F3_silu', {'activation': 'silu'}),
        ('F4_tanh', {'activation': 'tanh'}),
        ('F5_identity', {'activation': 'identity'}),
    ]
    configs = []
    for variant_name, overrides in variants:
        for band in ['LOW', 'MID', 'HIGH']:
            configs.append({
                'group': 'F',
                'variant': variant_name,
                'band': band,
                'seed': 0,
                'phase': 1,
                # Copy so the three bands do not alias one mutable dict.
                'overrides': dict(overrides),
                'description': f'F-{band}-{variant_name}',
            })
    return configs
def group_G_sphere_norm() -> List[Dict[str, Any]]:
    """Build the sphere-norm ablation group: 4 variants x 3 bands = 12 runs.

    Expected per framework: G2 (no sphere-norm) reproduces the charge-
    discharge catastrophe. G3/G4 may or may not preserve the band.

    Returns:
        Phase 1 configs ordered variant-major, then LOW/MID/HIGH. Every
        config owns an independent copy of its overrides dict.
    """
    variants = [
        ('G1_sphere_norm', {'row_norm': 'sphere'}),  # baseline, F.normalize(dim=-1)
        ('G2_no_norm', {'row_norm': 'none'}),  # raw M to SVD
        ('G3_layer_norm', {'row_norm': 'layer_norm'}),
        ('G4_scale_only', {'row_norm': 'scale_only'}),
    ]
    configs = []
    for variant_name, overrides in variants:
        for band in ['LOW', 'MID', 'HIGH']:
            configs.append({
                'group': 'G',
                'variant': variant_name,
                'band': band,
                'seed': 0,
                'phase': 1,
                # Copy so the three bands do not alias one mutable dict.
                'overrides': dict(overrides),
                'description': f'G-{band}-{variant_name}',
            })
    return configs
def group_H_svd_necessity() -> List[Dict[str, Any]]:
    """Build the SVD-necessity group for PHASE 2.

    Each run is 1 epoch (~3900 batches at batch_size=256). Tests whether
    a learned linear readout can match SVD, and whether fp64 SVD
    precision and per-batch SVD are load-bearing.

    Staged seed counts based on the question each variant answers:
      - H1/H2/H3 (3 seeds): core SVD-vs-linear comparison, needs variance
      - H4/H5 (2 seeds): precision/batching questions, binary yes/no
      - H6 (1 seed): expected-failure confirmation

    Total: 3x3 + 3x3 + 3x3 + 3x2 + 3x2 + 3x1 = 42 runs.

    Returns:
        Phase 2 configs ordered by tier (full, probe, confirm), then
        variant, band, seed. Every config owns an independent copy of
        its overrides dict.
    """
    variants_full = [  # 3 seeds
        ('H1_svd_fp64', {'svd': 'fp64'}),
        ('H2_linear_matched', {'svd': 'none', 'linear_readout': True, 'match_params': True}),
        ('H3_linear_unmatched', {'svd': 'none', 'linear_readout': True, 'match_params': False}),
    ]
    variants_probe = [  # 2 seeds
        ('H4_svd_fp32', {'svd': 'fp32'}),
        ('H5_batch_shared_svd', {'svd': 'batch_shared'}),
    ]
    variants_confirm = [  # 1 seed, expected failure
        ('H6_no_svd_direct', {'svd': 'none', 'linear_readout': False}),
    ]
    configs = []
    for variants, n_seeds in [(variants_full, 3), (variants_probe, 2), (variants_confirm, 1)]:
        for variant_name, overrides in variants:
            for band in ['LOW', 'MID', 'HIGH']:
                for seed in range(n_seeds):
                    configs.append({
                        'group': 'H',
                        'variant': variant_name,
                        'band': band,
                        'seed': seed,
                        'phase': 2,
                        'num_epochs': 1,
                        'batch_size': 256,
                        # Copy so bands/seeds do not alias one mutable dict.
                        'overrides': dict(overrides),
                        'description': f'H-{band}-{variant_name}-s{seed}',
                    })
    return configs
def group_L2_lbfgs() -> List[Dict[str, Any]]:
    """Build the LBFGS characterization group for PHASE 2.

    Each run is 1 epoch at batch_size=32 (phase2_batch_limit caps LBFGS
    runs at 2000 outer batches).

    Front-loads LBFGS investigation after Phil's isolated test at 100
    batches showed LBFGS + pure MSE + no soft-hand reaches the HIGH
    attractor (CV 0.869) with better within-attractor reconstruction MSE
    (0.0644) than Adam + soft-hand achieves at 30 epochs (0.072).
    Phase 2 L2 tests whether this gap holds at epoch scale and whether
    the MID band shows a similar effect.

    CAVEAT ("stipend"): LOW band (D=16) is OMITTED pending the LBFGS
    engineering pass. An isolated test in the Phase 1 session confirmed
    LBFGS + sphere_norm + D=16 crashes torch.linalg.eigh (error code 15,
    ill-conditioned Gram matrix). PyTorch LBFGS's flat-space strong
    Wolfe line search drives parameters off the sphere manifold,
    producing degenerate SVD inputs. The fix requires a Riemannian
    (constraint-aware) line search - see scratchpad entry 000080 for
    the engineering pass TODO. L2-LOW will be runnable once RLBFGS
    integration lands.

    Current scope: MID + HIGH only, pure MSE + no soft-hand (matching
    the Phil isolated test configuration that produced the 0.869/0.0644
    data point).

    2 bands x 3 seeds = 6 runs.

    Returns:
        Phase 2 configs ordered MID then HIGH, seeds 0-2 within each.
        Every config owns an independent copy of its overrides dict.
    """
    variants = [
        ('L2_lbfgs_pure_mse', {
            'optimizer': 'lbfgs',
            'lr': 1.0,
            'batch_size': 32,  # LBFGS small-batch required for closure stability
            'soft_hand': False,  # no soft-hand (corrupted Hessian approximation)
            'boost': 0.0,
            'cv_penalty': 0.0,
        }),
    ]
    configs = []
    for variant_name, overrides in variants:
        for band in ['MID', 'HIGH']:  # LOW stipended - see docstring
            for seed in range(3):
                configs.append({
                    'group': 'L2',
                    'variant': variant_name,
                    'band': band,
                    'seed': seed,
                    'phase': 2,
                    'num_epochs': 1,
                    'batch_size': 32,  # overrides default (LBFGS needs small batch)
                    # Copy so the six runs do not alias one mutable dict.
                    'overrides': dict(overrides),
                    'description': f'L2-{band}-{variant_name}-s{seed}',
                })
    return configs
def group_I_cross_attention() -> List[Dict[str, Any]]:
    """Build the cross-attention necessity group: 4 variants x 3 bands = 12 runs.

    Returns:
        Phase 1 configs ordered variant-major, then LOW/MID/HIGH. Every
        config owns an independent copy of its overrides dict.
    """
    variants = [
        ('I1_1layer', {'n_cross': 1, 'max_alpha': 0.2}),
        ('I2_0layers', {'n_cross': 0}),
        ('I3_2layers', {'n_cross': 2, 'max_alpha': 0.2}),
        ('I4_unbounded_alpha', {'n_cross': 1, 'max_alpha': 1.0}),
    ]
    configs = []
    for variant_name, overrides in variants:
        for band in ['LOW', 'MID', 'HIGH']:
            configs.append({
                'group': 'I',
                'variant': variant_name,
                'band': band,
                'seed': 0,
                'phase': 1,
                # Copy so the three bands do not alias one mutable dict.
                'overrides': dict(overrides),
                'description': f'I-{band}-{variant_name}',
            })
    return configs
def group_J_capacity_within_LOW() -> List[Dict[str, Any]]:
    """Build the capacity sweep inside the LOW band (5 variants, 5 runs).

    Probes the minimum on-attractor parameter count by varying V and
    hidden width while staying on the LOW band representative.

    Returns:
        Phase 1 configs, one per variant, all with band 'LOW' and seed 0.
    """
    # (variant name, V, hidden) - approximate parameter counts per the
    # original notes: J1 baseline 184K, J2 ~50K, J3 ~30K, J4 ~100K, J5 ~528K.
    size_grid = (
        ('J1_V64_h64', 64, 64),
        ('J2_V32_h32', 32, 32),
        ('J3_V16_h32', 16, 32),
        ('J4_V64_h32', 64, 32),
        ('J5_V128_h128', 128, 128),
    )
    return [
        {
            'group': 'J',
            'variant': name,
            'band': 'LOW',
            'seed': 0,
            'phase': 1,
            'overrides': {'V': n_rows, 'hidden': width},
            'description': f'J-LOW-{name}',
        }
        for name, n_rows, width in size_grid
    ]
def group_K_batch_size() -> List[Dict[str, Any]]:
    """Build the batch-size sensitivity group: 4 variants x 3 bands = 12 runs.

    Returns:
        Phase 1 configs ordered variant-major, then LOW/MID/HIGH. Every
        config owns an independent copy of its overrides dict.
    """
    variants = [
        ('K1_bs128', {'batch_size': 128}),
        ('K2_bs32', {'batch_size': 32}),
        ('K3_bs512', {'batch_size': 512}),
        ('K4_bs1024', {'batch_size': 1024}),
    ]
    configs = []
    for variant_name, overrides in variants:
        for band in ['LOW', 'MID', 'HIGH']:
            configs.append({
                'group': 'K',
                'variant': variant_name,
                'band': band,
                'seed': 0,
                'phase': 1,
                # Copy so the three bands do not alias one mutable dict.
                'overrides': dict(overrides),
                'description': f'K-{band}-{variant_name}',
            })
    return configs
def group_L_initialization() -> List[Dict[str, Any]]:
    """Build the weight-initialization group: 4 variants x 3 bands = 12 runs.

    Returns:
        Phase 1 configs ordered variant-major, then LOW/MID/HIGH. Every
        config owns an independent copy of its overrides dict.
    """
    variants = [
        ('L1_orthogonal', {'init': 'orthogonal'}),
        ('L2_kaiming', {'init': 'kaiming_normal'}),
        ('L3_xavier', {'init': 'xavier_uniform'}),
        ('L4_normal_small', {'init': 'normal_0_02'}),
    ]
    configs = []
    for variant_name, overrides in variants:
        for band in ['LOW', 'MID', 'HIGH']:
            configs.append({
                'group': 'L',
                'variant': variant_name,
                'band': band,
                'seed': 0,
                'phase': 1,
                # Copy so the three bands do not alias one mutable dict.
                'overrides': dict(overrides),
                'description': f'L-{band}-{variant_name}',
            })
    return configs
def group_M_brute_force_sgd() -> List[Dict[str, Any]]:
    """Build the brute-force SGD stress group: 3 variants x 3 bands = 9 runs.

    Returns:
        Phase 1 configs ordered variant-major, then LOW/MID/HIGH. Every
        config owns an independent copy of its overrides dict.
    """
    variants = [
        ('M1_sgd_aggressive', {'optimizer': 'sgd', 'lr': 1e-1, 'momentum': 0.0, 'warmup': 0}),
        ('M2_sgd_huge_lr', {'optimizer': 'sgd', 'lr': 1.0, 'momentum': 0.0, 'grad_clip': 1.0}),
        ('M3_sgd_high_momentum', {'optimizer': 'sgd', 'lr': 3e-3, 'momentum': 0.99}),
    ]
    configs = []
    for variant_name, overrides in variants:
        for band in ['LOW', 'MID', 'HIGH']:
            configs.append({
                'group': 'M',
                'variant': variant_name,
                'band': band,
                'seed': 0,
                'phase': 1,
                # Copy so the three bands do not alias one mutable dict.
                'overrides': dict(overrides),
                'description': f'M-{band}-{variant_name}',
            })
    return configs
def group_N_uniformity_diagnostic() -> List[Dict[str, Any]]:
    """Placeholder for the attractor uniformity diagnostic (no own runs).

    Group N is NOT a standalone group. It is a flag that the diagnostic
    should be added to every other variant's post-training analysis:
      1. Extract final sphere-normed rows
      2. Compute pentachoron CV at n_samples=2000
      3. Compare to uniform-sphere prediction for that D
      4. Record observed_CV, uniform_CV, deviation in final_report.json

    Returns:
        Always an empty list.
    """
    no_standalone_runs: List[Dict[str, Any]] = []
    return no_standalone_runs
# ----------------------------------------------------------------------------
# Full matrix assembly
# ----------------------------------------------------------------------------
def get_phase1_configs() -> List[Dict[str, Any]]:
    """Assemble the Phase 1 matrix of band-classification ablations.

    Groups are concatenated in the recommended run order (most
    informative first):
      1. Group A (seed replication) - foundational
      2. Group G (sphere-norm) - framework verification
      3. Group E_preview (soft-hand 1000-batch preview)
      4. Groups B, C, D, F, I, J, K, L, M - remaining ablations

    Returns:
        A new list holding every Phase 1 config (149 in total, given the
        per-group counts noted below).
    """
    ordered_builders = (
        group_A_seed_replication,     # 15 runs
        group_G_sphere_norm,          # 12 runs
        group_E_subset_phase1,        # 12 runs
        group_B_dataset_composition,  # 18 runs
        group_C_optimizer,            # 12 runs (4 variants x 3 bands)
        group_D_schedule,             # 15 runs
        group_F_activation,           # 15 runs
        group_I_cross_attention,      # 12 runs
        group_J_capacity_within_LOW,  # 5 runs
        group_K_batch_size,           # 12 runs
        group_L_initialization,       # 12 runs
        group_M_brute_force_sgd,      # 9 runs
    )
    matrix: List[Dict[str, Any]] = []
    for build_group in ordered_builders:
        matrix.extend(build_group())
    return matrix
def group_P_small_battery_floor() -> List[Dict[str, Any]]:
    """Build the small-battery floor sweep (PHASE 2, tiny batch budget).

    Grid-sweeps architecture at the H2_linear_matched baseline to find
    the smallest battery that still reconstructs gaussian within a
    reasonable multiplier of the h2-64 floor AND lands in a valid
    geometric attractor (CV in MID/HIGH range).

    Grid axes (outermost to innermost):
        hidden:    {4, 8, 16, 32, 64}   5
        V:         {2, 4, 8, 16, 32}    5
        D:         {2, 3, 4}            3
        depth:     {0, 1}               2
        n_cross:   {0, 1}               2
        optimizer: {'adam', 'lbfgs'}    2
    Full product: 5 x 5 x 3 x 2 x 2 x 2 = 600 runs.

    Pins (H2_linear_matched baseline):
        svd='none', linear_readout=True, match_params=True
        band='HIGH' (patch_size=4, img_size=64)
        batch_size=256
        batch_limit=20 (5120 samples seen - matches floor-sweep budget)

    NOTE: smooth_mid is NOT varied here - PatchSVAE_F_Ablation doesn't
    expose it as a parameter. All configs use the PatchSVAE_F_Ablation
    default BoundarySmooth. If smooth_mid variation is needed later,
    plumb it through the model class and add it as a grid axis.

    LIMITATION (per the original notes): cv_of() returns 0 for V<5
    (pentachoron volume needs >=5 points). V in {2, 4} configs will have
    observed_sphere_cv=0, cv_ema=0, and predicted_band='LOW'. This is an
    architectural constraint of the geometric validity metric, not a
    training failure. Use test_mse_per_noise[0] and
    train_loss_trajectory as the primary quality metrics for those
    configs; CV-based analysis applies only to V>=8 configs.

    Records via run_ablation_config's full report: CV_ema, cv_last,
    S0, SD, ratio, erank, observed_sphere_cv, band_deviation,
    predicted_band, band_match, params_finite, cv_trajectory,
    train_loss_trajectory, test_mse, test_mse_per_noise, plus
    per-config wallclock and batches_completed.

    Returns:
        600 Phase 2 configs in grid order; each has a freshly built
        overrides dict.
    """
    from itertools import product

    # product() varies its last argument fastest, which reproduces the
    # hidden > V > D > depth > n_cross > optimizer nesting order.
    grid = product(
        [4, 8, 16, 32, 64],   # hidden
        [2, 4, 8, 16, 32],    # V
        [2, 3, 4],            # D
        [0, 1],               # depth
        [0, 1],               # n_cross
        ['adam', 'lbfgs'],    # optimizer
    )
    configs = []
    for hidden, V, D, depth, n_cross, optimizer in grid:
        variant_name = (
            f"P_h{hidden}_V{V}_D{D}_dp{depth}"
            f"_nx{n_cross}_{optimizer}"
        )
        # Per-optimizer LR tuned for the 20-step budget: Adam at 1e-4
        # (Phase-2 default) barely moves in 20 steps on small models.
        # LBFGS's line search handles its own step sizing; 1.0 is the
        # library default for a unit Wolfe step.
        lr = 3e-3 if optimizer == 'adam' else 1.0
        configs.append({
            'group': 'P',
            'variant': variant_name,
            'band': 'HIGH',
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 20,
            'overrides': {
                # H2_linear_matched baseline
                'svd': 'none',
                'linear_readout': True,
                'match_params': True,
                # Size axes
                'hidden': hidden,
                'V': V,
                'D': D,
                'depth': depth,
                'n_cross': n_cross,
                # Pin n_heads=1: D varies {2,3,4}, and the default
                # n_heads=4 would fail for D=2,3.
                'n_heads': 1,
                # Optimizer + LR tuned for short budget
                'optimizer': optimizer,
                'lr': lr,
                # Gradient clipping catches LBFGS explosions (both
                # initial-step Wolfe failures on tiny params and
                # mid-training Hessian-approximation corruption on
                # depth=1 + n_cross=1 configs). Standard defensive
                # practice for small-model sweeps; no cost when not
                # triggered.
                'grad_clip': 1.0,
                # Measure CV every 2 batches (was 50 - too coarse for
                # a 20-batch sweep).
                'cv_measure_every': 2,
                # Pure MSE, no soft-hand (per 000079 - LBFGS Hessian
                # corruption avoidance)
                'soft_hand': False,
                # Training: gaussian only (for floor detection)
                'noise_types': [0],
                # Testing: all 16 noises, 256 each. Separate from the
                # training distribution so per-noise generalization is
                # measured.
                'test_noise_types': list(range(16)),
                'test_samples_per_noise': 256,
                'test_batch_size': 64,
            },
            'description': (
                f'P-HIGH-{variant_name} '
                f'(floor sweep, 20-batch budget)'
            ),
        })
    return configs
def group_implicit_solver_A_d5_spherical() -> List[Dict[str, Any]]:
    """Build the implicit-solver A-set: D=5 spherical reference batteries.

    Three configs to test the projective-axis hypothesis at D=5:
        A3a: V=16, D=5 - minimal V, may force more antipodal collapses
        A3b: V=32, D=5 - direct comparator to H2a (V=32, D=4)
        A3c: V=64, D=5 - extra V room, may reduce antipodal pair count

    All configs match the Q-rank02 (H2a) baseline:
        H2_linear_matched: svd=none, linear_readout=True, match_params=True
        Adam @ lr=3e-3, depth=0, n_cross=0, n_heads=1
        1000 batches, gaussian-only training
        Per-noise test on all 16 noise types

    Predicted (if 000101 generalizes to D=5):
        - All three converge with finite MSE
        - All three show projective-uniform distribution on RP^4
        - Axis count grows with V; antipodal pair count grows with V/D
        - Effective rank stays near full (~4.95/5)

    A3b is the critical test (matches the H2a config except D bumped to 5).

    Returns:
        Three Phase 2 configs in A3a, A3b, A3c order.
    """
    latent_dim = 5
    # label -> V (row count); insertion order fixes the output order.
    rows_by_label = {
        'A3a_V16_D5': 16,
        'A3b_V32_D5': 32,
        'A3c_V64_D5': 64,
    }
    batteries = []
    for label, n_rows in rows_by_label.items():
        name = f"{label}_h64_dp0_nx0_adam"
        run_overrides = {
            'svd': 'none',
            'linear_readout': True,
            'match_params': True,
            'hidden': 64,
            'V': n_rows,
            'D': latent_dim,
            'depth': 0,
            'n_cross': 0,
            'n_heads': 1,
            'optimizer': 'adam',
            'lr': 3e-3,
            'grad_clip': 1.0,
            'cv_measure_every': 50,
            'soft_hand': False,
            'noise_types': [0],
            'test_noise_types': list(range(16)),
            'test_samples_per_noise': 256,
            'test_batch_size': 64,
        }
        batteries.append({
            'group': 'implicit_solver_A',
            'variant': name,
            'band': 'HIGH',  # nominally HIGH - D=5 is a new regime
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 1000,
            'overrides': run_overrides,
            'description': (
                f'implicit_solver_A-{name} '
                f'(D=5 spherical reference, projective probe target)'
            ),
        })
    return batteries
def get_implicit_solver_A_configs() -> List[Dict[str, Any]]:
    """Return the implicit-solver A-set Stage 1 configs (D=5 spherical references).

    Thin accessor over group_implicit_solver_A_d5_spherical().
    """
    stage1_configs = group_implicit_solver_A_d5_spherical()
    return stage1_configs
def group_R_packed_polytope_test() -> List[Dict[str, Any]]:
    """Build the sphere-packing prediction test - does V x D matter geometrically?

    Hypothesis (from G-Class probe v3): the 32-row x D=3 G-Class behavior
    (rotating antipodal frame) emerged because 32 points cannot be
    uniformly arranged on S^2 - geometric frustration. When V matches a
    natural polytope vertex count for S^(D-1), training should produce
    STATIC sphere-solver rows instead.

    Three test configs (each predicted to produce H2-LIKE static rows):
        - D=4, V=16: 16-cell (4-orthoplex) vertex count on S^3
        - D=4, V=8:  16-cell again (8 vertices = 4D cross-polytope subset)
                     or 8-cell (tesseract) - 8 is canonical for both
        - D=3, V=20: dodecahedron vertex count on S^2

    NOTE(review): in standard nomenclature the 16-cell (4-orthoplex) has
    8 vertices and the 8-cell (tesseract) has 16, so the V=16 / V=8
    polytope labels look swapped. The labels are left untouched because
    they are embedded in the variant identifiers - confirm before
    renaming.

    All else matches H2a (Q-rank02): adam, lr=3e-3, depth=0, n_cross=0,
    H2_linear_matched (svd=none, linear_readout=True, match_params=True).
    1000 batches, gaussian-only training, 16-noise per-noise test.

    Predicted result: all three produce row_stability > 0.85, antipodal
    pair fraction < 0.55 - i.e. H2-LIKE character on the v3 probe.

    Returns:
        Three Phase 2 configs in the order listed above.
    """
    # (polytope label, V, D)
    packing_targets = (
        ('16cell_orthoplex', 16, 4),
        ('8cell_or_16cell_subset', 8, 4),
        ('dodecahedron', 20, 3),
    )
    runs = []
    for shape_label, n_rows, latent_dim in packing_targets:
        name = f"R_h64_V{n_rows}_D{latent_dim}_{shape_label}_adam"
        run_overrides = {
            'svd': 'none',
            'linear_readout': True,
            'match_params': True,
            'hidden': 64,
            'V': n_rows,
            'D': latent_dim,
            'depth': 0,
            'n_cross': 0,
            'n_heads': 1,
            'optimizer': 'adam',
            'lr': 3e-3,
            'grad_clip': 1.0,
            'cv_measure_every': 50,
            'soft_hand': False,
            'noise_types': [0],
            'test_noise_types': list(range(16)),
            'test_samples_per_noise': 256,
            'test_batch_size': 64,
        }
        runs.append({
            'group': 'R',
            'variant': name,
            'band': 'HIGH',
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 1000,
            'overrides': run_overrides,
            'description': (
                f'R-HIGH-{name} '
                f'(packing test, predicted H2-LIKE)'
            ),
        })
    return runs
def get_phaseR_configs() -> List[Dict[str, Any]]:
    """Return the Phase R matrix: the sphere-packing prediction test (3 configs).

    Thin accessor over group_R_packed_polytope_test().
    """
    packing_configs = group_R_packed_polytope_test()
    return packing_configs
def group_Q_h2_candidates() -> List[Dict[str, Any]]:
    """Build the top-10 P-sweep winners, extended to 1000 batches.

    These are the 10 configs flagged by the P-sweep analyzer's
    continued-training-potential ranking. Each is re-run with the same
    architecture and optimizer but with batch_limit=1000 (50x the P
    sweep's 20-batch budget).

    Purpose: answer the classification questions the P sweep couldn't:
        - What's the actual convergence floor per config?
        - Does Adam catch LBFGS with enough budget? (the list below
          holds 7 Adam and 3 LBFGS entries)
        - Where does the loss trajectory flatten?
        - Does discrimination ratio sharpen with more training?
        - Does final CV land in the valid band (0.13-0.30)?

    Results feed into H2 class-rank assignment.

    cv_measure_every=50 gives ~20 CV measurements across the run (the
    P sweep used 2, which would be 500 measurements at 1000 batches -
    too many).

    Returns:
        Ten Phase 2 configs in rank order; the 1-based rank is embedded
        in each variant name as 'rankNN'.
    """
    # Ranked by continued_training_potential.
    # Columns: (hidden, V, D, depth, n_cross, optimizer)
    ranked_winners = (
        (64, 32, 4, 1, 0, 'lbfgs'),  # 1  - 57123 params, P-MSE 0.053
        (64, 32, 4, 0, 0, 'adam'),   # 2  - 40227 params, P-MSE 0.572
        (64, 32, 4, 0, 1, 'adam'),   # 3  - 40319 params, P-MSE 0.584
        (64, 32, 4, 0, 1, 'lbfgs'),  # 4  - 40319 params, P-MSE 0.041
        (64, 16, 4, 1, 1, 'lbfgs'),  # 5  - 36607 params, P-MSE 0.115
        (64, 32, 3, 1, 1, 'adam'),   # 6  - 45852 params, P-MSE 0.656
        (64, 32, 3, 0, 1, 'adam'),   # 7  - 28956 params, P-MSE 0.641
        (64, 32, 4, 1, 1, 'adam'),   # 8  - 57215 params, P-MSE 0.620
        (64, 32, 3, 0, 0, 'adam'),   # 9  - 28899 params, P-MSE 0.638
        (64, 32, 2, 0, 1, 'adam'),   # 10 - 19649 params, P-MSE 0.736
    )
    candidates = []
    for position, winner in enumerate(ranked_winners):
        width, n_rows, latent_dim, n_layers, cross_layers, opt_name = winner
        place = position + 1  # ranks are 1-based
        name = (
            f"Q_rank{place:02d}_h{width}_V{n_rows}_D{latent_dim}_dp{n_layers}"
            f"_nx{cross_layers}_{opt_name}"
        )
        # Same LR as the P sweep: Adam 3e-3, everything else (LBFGS) 1.0.
        if opt_name == 'adam':
            step_size = 3e-3
        else:
            step_size = 1.0
        run_overrides = {
            # H2_linear_matched baseline
            'svd': 'none',
            'linear_readout': True,
            'match_params': True,
            # Size axes (from P winner)
            'hidden': width,
            'V': n_rows,
            'D': latent_dim,
            'depth': n_layers,
            'n_cross': cross_layers,
            'n_heads': 1,
            # Optimizer
            'optimizer': opt_name,
            'lr': step_size,
            'grad_clip': 1.0,
            # ~20 CV measurements across the 1000-batch run
            'cv_measure_every': 50,
            # Pure MSE, no soft-hand
            'soft_hand': False,
            # Training: gaussian only (matches P sweep)
            'noise_types': [0],
            # Full 16-noise test at end
            'test_noise_types': list(range(16)),
            'test_samples_per_noise': 256,
            'test_batch_size': 64,
        }
        candidates.append({
            'group': 'Q',
            'variant': name,
            'band': 'HIGH',
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 1000,  # 50x the P sweep
            'overrides': run_overrides,
            'description': (
                f'Q-HIGH-{name} '
                f'(H2 candidate extended sweep, 1000 batches)'
            ),
        })
    return candidates
def get_phaseQ_configs() -> List[Dict[str, Any]]:
    """Return the Phase Q matrix: top-10 P winners at 1000 batches.

    Used for H2 class-rank assignment; thin accessor over
    group_Q_h2_candidates().
    """
    candidate_configs = group_Q_h2_candidates()
    return candidate_configs
def get_phaseP_configs() -> List[Dict[str, Any]]:
    """Return the Phase P (floor sweep) matrix: 600 configs at 20 batches each.

    Thin accessor over group_P_small_battery_floor().
    """
    floor_sweep_configs = group_P_small_battery_floor()
    return floor_sweep_configs
def get_phase2_configs() -> List[Dict[str, Any]]:
    """Assemble the Phase 2 matrix (1 epoch each, resume-capable).

    Revised from the original 174-config design after Phase 1 settled
    the "does the attractor survive" question. Phase 2 now characterizes
    WITHIN-ATTRACTOR behavior over one full epoch (~3900 batches at
    batch_size=256):
      - Group E (36 runs): within-attractor MSE under each soft-hand regime
      - Group H (42 runs): SVD necessity (vs learned linear readout)
      - Group L2 (6 runs): LBFGS within-attractor MSE characterization
        (MID + HIGH only; LOW stipended pending the RLBFGS engineering
        pass - see the group_L2_lbfgs docstring)

    Total: 84 runs. Intriguing cases can be continued to epoch 3 or 5
    using the orchestrator's continue_training() function.

    Returns:
        A new list: group E configs, then group H, then group L2.
    """
    phase2_matrix: List[Dict[str, Any]] = []
    phase2_matrix += group_E_soft_hand()       # 36 runs
    phase2_matrix += group_H_svd_necessity()   # 42 runs
    phase2_matrix += group_L2_lbfgs()          # 6 runs
    return phase2_matrix
def summarize(configs: List[Dict[str, Any]]) -> None:
    """Print a sanity-check breakdown of a config matrix.

    Counts configs per 'group', per 'band' and per 'phase', then prints
    the total followed by each tally in sorted key order.

    Args:
        configs: config dicts; each must carry 'group', 'band' and
            'phase' keys (a missing key raises KeyError).
    """
    tallies = {'group': {}, 'band': {}, 'phase': {}}
    for cfg in configs:
        # Single pass; fields are read in group, band, phase order.
        for field_name, counts in tallies.items():
            value = cfg[field_name]
            counts[value] = counts.get(value, 0) + 1

    print(f"Total configs: {len(configs)}")

    print(f"\nBy group:")
    for group_name, count in sorted(tallies['group'].items()):
        print(f" {group_name}: {count}")

    print(f"\nBy band:")
    for band_name, count in sorted(tallies['band'].items()):
        print(f" {band_name}: {count}")

    print(f"\nBy phase:")
    for phase_id, count in sorted(tallies['phase'].items()):
        print(f" Phase {phase_id}: {count}")
if __name__ == '__main__':
    # Script entry point: print a sanity-check breakdown of both matrices.
    print("=" * 60)
    print("PHASE 1 MATRIX")
    print("=" * 60)
    summarize(get_phase1_configs())
    print()
    print("=" * 60)
    print("PHASE 2 MATRIX")
    print("=" * 60)
    summarize(get_phase2_configs())