# geolip-svae-implicit-solver-experiments / 002_ablation_configs.py
# AbstractPhil's picture
# Rename 2_ablation_configs.py to 002_ablation_configs.py
# 2ccca38 verified
"""
ablation_configs.py
====================
The ablation matrix for the three-band SVAE validation sweep.
Each config is a dict of overrides on the baseline PatchSVAE_F trainer.
The trainer expects:
- band: 'LOW' | 'MID' | 'HIGH' (selects the base architecture)
- variant: unique identifier for this variant within the group
- seed: random seed
- phase: 1 (early-stop triage, at most 1000 batches) | 2 (full run, currently 1 epoch per config)
- overrides: dict of RunConfig field overrides
Three band representatives (kept constant across every test):
LOW: S=64, V=64, D=16, h=64, d=1, patch=16, 184K params, CV target ≈ 0.21
MID: S=64, V=64, D=8, h=64, d=1, patch=16, 183K params, CV target ≈ 0.39
HIGH: S=64, V=32, D=4, h=64, d=1, patch=4, 41K params, CV target ≈ 1.10
Phase 1 early-stop:
- LOW/MID bands: train to batch 1000, record CV_ema, classify band
- HIGH band: train to batch 100, record CV_ema, classify band
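Phase 2 full run (as currently built by the group functions below):
- Group E (soft-hand): 1 epoch, 3 seeds per variant
- Group H (SVD necessity): 1 epoch, 1 to 3 seeds per variant (staged by variant)
- Group L2 (LBFGS): 1 epoch, 3 seeds, MID and HIGH bands only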
"""
from typing import Dict, List, Any
from dataclasses import dataclass, field, asdict
# ----------------------------------------------------------------------------
# Band representatives — the three anchor configs
# ----------------------------------------------------------------------------
# Anchor architecture for each band.  These settings are held constant across
# every ablation; 'expected_cv' and 'expected_params' are the reference values
# quoted for each band in the module docstring.
BAND_REPS = {
    'LOW': dict(
        img_size=64,
        V=64,
        D=16,
        hidden=64,
        depth=1,
        patch_size=16,
        n_cross=1,
        expected_cv=0.21,
        expected_params=184_000,
    ),
    'MID': dict(
        img_size=64,
        V=64,
        D=8,
        hidden=64,
        depth=1,
        patch_size=16,
        n_cross=1,
        expected_cv=0.39,
        expected_params=183_000,
    ),
    'HIGH': dict(
        img_size=64,
        V=32,
        D=4,
        hidden=64,
        depth=1,
        patch_size=4,
        n_cross=1,
        expected_cv=1.10,
        expected_params=41_000,
    ),
}
def band_classifier(cv_ema: float) -> str:
    """Map a final CV-EMA value onto a band label.

    Thresholds: below 0.30 is 'LOW', from 0.30 up to (not including) 0.55
    is 'MID', strictly above 0.80 is 'HIGH'.  Anything else -- the gap
    from 0.55 through 0.80 inclusive, or a value that fails every
    comparison such as NaN -- is 'UNCLASSIFIED'.
    """
    if cv_ema > 0.80:
        return 'HIGH'
    if cv_ema < 0.30:
        return 'LOW'
    if cv_ema < 0.55:
        return 'MID'
    return 'UNCLASSIFIED'
def phase1_batch_limit(band: str) -> int:
    """Return the Phase 1 early-stop batch count for *band*.

    The 'HIGH' band stops at 100 batches; every other band value stops
    at 1000.
    """
    return 100 if band == 'HIGH' else 1000
def phase2_batch_limit(config: Dict[str, Any]) -> int:
    """Return the number of batches per epoch for a Phase 2 config.

    Resolution order:

    1. An explicit top-level 'batch_limit' on the config wins.  This lets
       the floor sweep (group P) cap runs at a few dozen batches without
       touching the defaults used by other Phase 2 configs.
    2. If the config's overrides select the 'lbfgs' optimizer, the limit
       is 2000 outer batches.  Per the original author's note, LBFGS runs
       about 20 inner iterations per outer step, so 2000 outer batches is
       roughly 40k gradient steps -- judged plenty for within-attractor
       convergence and far cheaper in wallclock than the 31250 batches a
       full pass at batch_size=32 would take.
    3. Otherwise the limit is 1_000_000 // batch_size, where batch_size
       is the config's top-level 'batch_size' (default 256, i.e. 3906
       batches).

    Only the top-level 'batch_size' is consulted; a 'batch_size' inside
    'overrides' does not affect the result.
    """
    if 'batch_limit' in config:
        # Per-config explicit cap takes precedence over everything else.
        return config['batch_limit']

    uses_lbfgs = config.get('overrides', {}).get('optimizer') == 'lbfgs'
    if uses_lbfgs:
        return 2000  # wallclock cap for LBFGS

    return 1_000_000 // config.get('batch_size', 256)
# ----------------------------------------------------------------------------
# Ablation group definitions
# ----------------------------------------------------------------------------
def group_A_seed_replication() -> List[Dict[str, Any]]:
    """Build the reproducibility group: 5 seeds x 3 bands = 15 runs.

    Tests whether each band reproducibly appears across random inits.
    Acceptance: >=4/5 seeds per band within +/-0.02 of expected CV.

    Returns:
        Phase 1 configs ordered band-major (LOW, MID, HIGH), seeds 0-4
        within each band.  Every config has empty overrides; only the
        seed varies.
    """
    return [
        {
            'group': 'A',
            'variant': 'baseline',
            'band': band,
            'seed': seed,
            'phase': 1,
            'overrides': {},  # baseline: nothing changes except the seed
            'description': f'A-{band}-baseline-s{seed}',
        }
        for band in ('LOW', 'MID', 'HIGH')
        for seed in range(5)
    ]
def group_B_dataset_composition() -> List[Dict[str, Any]]:
    """Build the noise-type dependence group: 6 variants x 3 bands = 18 runs.

    Tests whether band structure is architecture-driven or data-driven by
    restricting training to different subsets of the noise-type indices.

    Returns:
        Phase 1 configs ordered variant-major, bands LOW/MID/HIGH within
        each variant.  Each config owns its own 'noise_types' list, so
        editing one config's overrides cannot affect its siblings.
    """
    variants = {
        'B1_all16': list(range(16)),
        'B2_gaussian_only': [0],
        'B3_structured': [3, 4, 5, 11, 13],  # block, gradient, checker, mixed, structural
        'B4_heavy_tailed': [6, 7, 10],  # cauchy, laplace, exponential (check indices)
        'B5_first_half': list(range(8)),
        'B6_even_indices': [0, 2, 4, 6, 8, 10, 12, 14],
    }
    return [
        {
            'group': 'B',
            'variant': variant_name,
            'band': band,
            'seed': 0,
            'phase': 1,
            # Copy the index list: the three band configs of one variant
            # must not alias a single mutable list.
            'overrides': {'noise_types': list(types)},
            'description': f'B-{band}-{variant_name}',
        }
        for variant_name, types in variants.items()
        for band in ['LOW', 'MID', 'HIGH']
    ]
def group_C_optimizer() -> List[Dict[str, Any]]:
    """Build the optimizer-dependence group: 4 variants x 3 bands = 12 runs.

    Tests whether the attractor is specific to Adam.

    LBFGS was originally included as C5 but removed on 2026-04-20 after
    empirical evidence that it is incompatible with the sphere-normed
    architecture as currently constructed: its flat-space strong-Wolfe
    line search drives parameters away from the sphere manifold,
    producing ill-conditioned SVD inputs.  Observed symptoms:

    - D=16 crashed in torch.linalg.eigh ("failed to converge --
      ill-conditioned or too many repeated eigenvalues").
    - D=8 and D=4 completed but produced NaN test MSE.  CV measured at
      intermediate batches was still valid (0.3373 MID, 0.9435 HIGH),
      indicating parameters went non-finite during training.

    This is a finding about the LBFGS / sphere_norm interaction, NOT
    about LBFGS as an optimizer.  A proper test requires Riemannian LBFGS
    with a constraint-aware line search; see scratchpad entry 000080 for
    the dedicated LBFGS engineering-pass TODO.

    Returns:
        Phase 1 configs ordered variant-major, bands LOW/MID/HIGH within
        each variant.  Each config carries its own copy of the variant's
        override dict.
    """
    variants = [
        ('C1_adam', {'optimizer': 'adam', 'lr': 1e-4, 'weight_decay': 0.0}),
        ('C2_sgd', {'optimizer': 'sgd', 'lr': 1e-2, 'momentum': 0.0}),
        ('C3_sgd_momentum', {'optimizer': 'sgd', 'lr': 1e-2, 'momentum': 0.9}),
        ('C4_adamw', {'optimizer': 'adamw', 'lr': 1e-4, 'weight_decay': 0.01}),
    ]
    return [
        {
            'group': 'C',
            'variant': variant_name,
            'band': band,
            'seed': 0,
            'phase': 1,
            # Per-config copy: the band configs of one variant must not
            # share a single mutable override dict.
            'overrides': dict(overrides),
            'description': f'C-{band}-{variant_name}',
        }
        for variant_name, overrides in variants
        for band in ['LOW', 'MID', 'HIGH']
    ]
def group_D_schedule() -> List[Dict[str, Any]]:
    """Build the LR-schedule group: 5 variants x 3 bands = 15 runs.

    Returns:
        Phase 1 configs ordered variant-major, bands LOW/MID/HIGH within
        each variant.  Each config carries its own copy of the variant's
        override dict.
    """
    variants = [
        ('D1_cosine', {'scheduler': 'cosine'}),
        ('D2_constant', {'scheduler': 'constant'}),
        ('D3_linear_decay', {'scheduler': 'linear'}),
        ('D4_warm_restart', {'scheduler': 'cosine_warm_restarts', 'T_0': 1000}),
        ('D5_one_cycle', {'scheduler': 'one_cycle'}),
    ]
    return [
        {
            'group': 'D',
            'variant': variant_name,
            'band': band,
            'seed': 0,
            'phase': 1,
            # Per-config copy so sibling configs never alias one dict.
            'overrides': dict(overrides),
            'description': f'D-{band}-{variant_name}',
        }
        for variant_name, overrides in variants
        for band in ['LOW', 'MID', 'HIGH']
    ]
def group_E_soft_hand() -> List[Dict[str, Any]]:
    """Build the soft-hand guidance group for PHASE 2.

    Each run is 1 epoch (~3900 batches at batch_size=256).
    4 variants x 3 bands x 3 seeds = 36 runs.

    The Phase 1 E_preview already showed all four variants reaching the
    same band at 1000 batches (all within 0.0014 CV), so "does the
    attractor survive" is settled.  The Phase 2 question is instead:
    what is the within-attractor reconstruction MSE under each soft-hand
    regime over a full epoch?

    Primary comparison: E1 (full soft-hand) vs E2 (pure MSE).  If MSE
    differs meaningfully, soft-hand is trading reconstruction quality
    for geometric coherence at an epoch-scale budget.

    NOTE(review): E4 pins hard_cv_target=0.21 for every band, although
    0.21 is the LOW band's expected CV -- confirm this is intended for
    the MID and HIGH runs.

    Returns:
        Phase 2 configs ordered variant-major, then band, then seed 0-2.
        Each config carries its own copy of the variant's override dict.
    """
    variants = [
        ('E1_full_softhand', {'soft_hand': True, 'boost': 0.5, 'cv_penalty': 0.3}),
        ('E2_pure_mse', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0}),
        ('E3_measure_only', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0, 'cv_measurement_only': True}),
        ('E4_hard_cv_penalty', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 1.0, 'hard_cv_target': 0.21}),
    ]
    return [
        {
            'group': 'E',
            'variant': variant_name,
            'band': band,
            'seed': seed,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            # Per-config copy: nine configs per variant would otherwise
            # share one mutable override dict.
            'overrides': dict(overrides),
            'description': f'E-{band}-{variant_name}-s{seed}',
        }
        for variant_name, overrides in variants
        for band in ['LOW', 'MID', 'HIGH']
        for seed in range(3)
    ]
def group_E_subset_phase1() -> List[Dict[str, Any]]:
    """Build the Phase 1 preview of group E: 4 variants x 3 bands = 12 runs.

    One seed per variant at the Phase 1 batch budget.  A quick read on
    whether E2 (pure MSE) even approaches the attractor before committing
    to the full Phase 2 group E.

    Returns:
        Phase 1 configs tagged with group 'E_preview', ordered
        variant-major, bands LOW/MID/HIGH within each variant.  Each
        config carries its own copy of the variant's override dict.
    """
    variants = [
        ('E1_full_softhand', {'soft_hand': True, 'boost': 0.5, 'cv_penalty': 0.3}),
        ('E2_pure_mse', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0}),
        ('E3_measure_only', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0, 'cv_measurement_only': True}),
        ('E4_hard_cv_penalty', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 1.0, 'hard_cv_target': 0.21}),
    ]
    return [
        {
            'group': 'E_preview',
            'variant': variant_name,
            'band': band,
            'seed': 0,
            'phase': 1,
            # Per-config copy so sibling configs never alias one dict.
            'overrides': dict(overrides),
            'description': f'Eprev-{band}-{variant_name}',
        }
        for variant_name, overrides in variants
        for band in ['LOW', 'MID', 'HIGH']
    ]
def group_F_activation() -> List[Dict[str, Any]]:
    """Build the activation-function group: 5 variants x 3 bands = 15 runs.

    Returns:
        Phase 1 configs ordered variant-major, bands LOW/MID/HIGH within
        each variant.  Each config carries its own copy of the variant's
        override dict.
    """
    variants = [
        ('F1_gelu', {'activation': 'gelu'}),
        ('F2_relu', {'activation': 'relu'}),
        ('F3_silu', {'activation': 'silu'}),
        ('F4_tanh', {'activation': 'tanh'}),
        ('F5_identity', {'activation': 'identity'}),
    ]
    return [
        {
            'group': 'F',
            'variant': variant_name,
            'band': band,
            'seed': 0,
            'phase': 1,
            # Per-config copy so sibling configs never alias one dict.
            'overrides': dict(overrides),
            'description': f'F-{band}-{variant_name}',
        }
        for variant_name, overrides in variants
        for band in ['LOW', 'MID', 'HIGH']
    ]
def group_G_sphere_norm() -> List[Dict[str, Any]]:
    """Build the sphere-norm ablation group: 4 variants x 3 bands = 12 runs.

    Expected per the framework: G2 (no sphere-norm) reproduces the
    charge-discharge catastrophe.  G3/G4 may or may not preserve the band.

    Returns:
        Phase 1 configs ordered variant-major, bands LOW/MID/HIGH within
        each variant.  Each config carries its own copy of the variant's
        override dict.
    """
    variants = [
        ('G1_sphere_norm', {'row_norm': 'sphere'}),  # baseline, F.normalize(dim=-1)
        ('G2_no_norm', {'row_norm': 'none'}),  # raw M to SVD
        ('G3_layer_norm', {'row_norm': 'layer_norm'}),
        ('G4_scale_only', {'row_norm': 'scale_only'}),
    ]
    return [
        {
            'group': 'G',
            'variant': variant_name,
            'band': band,
            'seed': 0,
            'phase': 1,
            # Per-config copy so sibling configs never alias one dict.
            'overrides': dict(overrides),
            'description': f'G-{band}-{variant_name}',
        }
        for variant_name, overrides in variants
        for band in ['LOW', 'MID', 'HIGH']
    ]
def group_H_svd_necessity() -> List[Dict[str, Any]]:
    """Build the SVD-necessity group for PHASE 2.

    Each run is 1 epoch (~3900 batches at batch_size=256).  Tests whether
    a learned linear readout can match SVD, and whether fp64 SVD
    precision and per-batch SVD are load-bearing.

    Seed counts are staged by the question each variant answers:

    - H1/H2/H3 (3 seeds): core SVD-vs-linear comparison, needs variance.
    - H4/H5 (2 seeds): precision/batching questions, binary yes/no.
    - H6 (1 seed): expected-failure confirmation.

    Total: 3x3 + 3x3 + 3x3 + 3x2 + 3x2 + 3x1 = 42 runs.

    Returns:
        Phase 2 configs ordered tier-major (core, probe, confirm), then
        variant, band, and seed.  Each config carries its own copy of the
        variant's override dict.
    """
    core_variants = [  # 3 seeds
        ('H1_svd_fp64', {'svd': 'fp64'}),
        ('H2_linear_matched', {'svd': 'none', 'linear_readout': True, 'match_params': True}),
        ('H3_linear_unmatched', {'svd': 'none', 'linear_readout': True, 'match_params': False}),
    ]
    probe_variants = [  # 2 seeds
        ('H4_svd_fp32', {'svd': 'fp32'}),
        ('H5_batch_shared_svd', {'svd': 'batch_shared'}),
    ]
    confirm_variants = [  # 1 seed, expected failure
        ('H6_no_svd_direct', {'svd': 'none', 'linear_readout': False}),
    ]
    tiers = [(core_variants, 3), (probe_variants, 2), (confirm_variants, 1)]
    return [
        {
            'group': 'H',
            'variant': variant_name,
            'band': band,
            'seed': seed,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            # Per-config copy: all band/seed configs of a variant would
            # otherwise share one mutable override dict.
            'overrides': dict(overrides),
            'description': f'H-{band}-{variant_name}-s{seed}',
        }
        for tier_variants, n_seeds in tiers
        for variant_name, overrides in tier_variants
        for band in ['LOW', 'MID', 'HIGH']
        for seed in range(n_seeds)
    ]
def group_L2_lbfgs() -> List[Dict[str, Any]]:
    """Build the LBFGS characterization group for PHASE 2.

    Each run is 1 epoch at batch_size=32 (these configs set the LBFGS
    optimizer override, which phase2_batch_limit caps at 2000 batches).
    2 bands x 3 seeds = 6 runs.

    Front-loads the LBFGS investigation after Phil's isolated test at 100
    batches showed LBFGS + pure MSE + no soft-hand reaching the HIGH
    attractor (CV 0.869) with better within-attractor reconstruction MSE
    (0.0644) than Adam + soft-hand achieves at 30 epochs (0.072).  This
    group tests whether that gap holds at epoch scale and whether the MID
    band shows a similar effect.

    LOW band (D=16) is OMITTED pending the LBFGS engineering pass.  An
    isolated test in the Phase 1 session confirmed that LBFGS +
    sphere_norm + D=16 crashes torch.linalg.eigh (error code 15,
    ill-conditioned Gram matrix): PyTorch LBFGS's flat-space strong-Wolfe
    line search drives parameters off the sphere manifold, producing
    degenerate SVD inputs.  The fix requires a Riemannian
    (constraint-aware) line search -- see scratchpad entry 000080.
    L2-LOW becomes runnable once the RLBFGS integration lands.

    Current scope: MID + HIGH only, pure MSE + no soft-hand, matching the
    isolated-test configuration that produced the 0.869 / 0.0644 data
    point.

    Returns:
        Phase 2 configs ordered band-major (MID, HIGH), seeds 0-2 within
        each band.  Each config carries its own copy of the override
        dict.
    """
    variants = [
        ('L2_lbfgs_pure_mse', {
            'optimizer': 'lbfgs',
            'lr': 1.0,
            'batch_size': 32,  # LBFGS small-batch required for closure stability
            'soft_hand': False,  # no soft-hand (corrupted Hessian approximation)
            'boost': 0.0,
            'cv_penalty': 0.0,
        }),
    ]
    return [
        {
            'group': 'L2',
            'variant': variant_name,
            'band': band,
            'seed': seed,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 32,  # overrides default (LBFGS needs small batch)
            # Per-config copy: all six configs would otherwise share one
            # mutable override dict.
            'overrides': dict(overrides),
            'description': f'L2-{band}-{variant_name}-s{seed}',
        }
        for variant_name, overrides in variants
        for band in ['MID', 'HIGH']  # LOW omitted -- see docstring
        for seed in range(3)
    ]
def group_I_cross_attention() -> List[Dict[str, Any]]:
    """Build the cross-attention necessity group: 4 variants x 3 bands = 12 runs.

    Returns:
        Phase 1 configs ordered variant-major, bands LOW/MID/HIGH within
        each variant.  Each config carries its own copy of the variant's
        override dict.
    """
    variants = [
        ('I1_1layer', {'n_cross': 1, 'max_alpha': 0.2}),
        ('I2_0layers', {'n_cross': 0}),
        ('I3_2layers', {'n_cross': 2, 'max_alpha': 0.2}),
        ('I4_unbounded_alpha', {'n_cross': 1, 'max_alpha': 1.0}),
    ]
    return [
        {
            'group': 'I',
            'variant': variant_name,
            'band': band,
            'seed': 0,
            'phase': 1,
            # Per-config copy so sibling configs never alias one dict.
            'overrides': dict(overrides),
            'description': f'I-{band}-{variant_name}',
        }
        for variant_name, overrides in variants
        for band in ['LOW', 'MID', 'HIGH']
    ]
def group_J_capacity_within_LOW() -> List[Dict[str, Any]]:
    """Build the minimum on-attractor parameter-count group.

    LOW band only, 5 variants, one Phase 1 config per variant.  The
    parameter counts noted beside each variant are the original author's
    estimates.
    """
    size_variants = [
        ('J1_V64_h64', {'V': 64, 'hidden': 64}),  # baseline, 184K
        ('J2_V32_h32', {'V': 32, 'hidden': 32}),  # ~50K
        ('J3_V16_h32', {'V': 16, 'hidden': 32}),  # ~30K
        ('J4_V64_h32', {'V': 64, 'hidden': 32}),  # ~100K
        ('J5_V128_h128', {'V': 128, 'hidden': 128}),  # ~528K
    ]
    return [
        {
            'group': 'J',
            'variant': label,
            'band': 'LOW',
            'seed': 0,
            'phase': 1,
            'overrides': size_overrides,
            'description': f'J-LOW-{label}',
        }
        for label, size_overrides in size_variants
    ]
def group_K_batch_size() -> List[Dict[str, Any]]:
    """Build the batch-size sensitivity group: 4 variants x 3 bands = 12 runs.

    The batch size is supplied through 'overrides' only; these Phase 1
    configs carry no top-level 'batch_size' field.

    Returns:
        Phase 1 configs ordered variant-major, bands LOW/MID/HIGH within
        each variant.  Each config carries its own copy of the variant's
        override dict.
    """
    variants = [
        ('K1_bs128', {'batch_size': 128}),
        ('K2_bs32', {'batch_size': 32}),
        ('K3_bs512', {'batch_size': 512}),
        ('K4_bs1024', {'batch_size': 1024}),
    ]
    return [
        {
            'group': 'K',
            'variant': variant_name,
            'band': band,
            'seed': 0,
            'phase': 1,
            # Per-config copy so sibling configs never alias one dict.
            'overrides': dict(overrides),
            'description': f'K-{band}-{variant_name}',
        }
        for variant_name, overrides in variants
        for band in ['LOW', 'MID', 'HIGH']
    ]
def group_L_initialization() -> List[Dict[str, Any]]:
    """Build the initialization group: 4 variants x 3 bands = 12 runs.

    Returns:
        Phase 1 configs ordered variant-major, bands LOW/MID/HIGH within
        each variant.  Each config carries its own copy of the variant's
        override dict.
    """
    variants = [
        ('L1_orthogonal', {'init': 'orthogonal'}),
        ('L2_kaiming', {'init': 'kaiming_normal'}),
        ('L3_xavier', {'init': 'xavier_uniform'}),
        ('L4_normal_small', {'init': 'normal_0_02'}),
    ]
    return [
        {
            'group': 'L',
            'variant': variant_name,
            'band': band,
            'seed': 0,
            'phase': 1,
            # Per-config copy so sibling configs never alias one dict.
            'overrides': dict(overrides),
            'description': f'L-{band}-{variant_name}',
        }
        for variant_name, overrides in variants
        for band in ['LOW', 'MID', 'HIGH']
    ]
def group_M_brute_force_sgd() -> List[Dict[str, Any]]:
    """Build the brute-force SGD stress group: 3 variants x 3 bands = 9 runs.

    Returns:
        Phase 1 configs ordered variant-major, bands LOW/MID/HIGH within
        each variant.  Each config carries its own copy of the variant's
        override dict.
    """
    variants = [
        ('M1_sgd_aggressive', {'optimizer': 'sgd', 'lr': 1e-1, 'momentum': 0.0, 'warmup': 0}),
        ('M2_sgd_huge_lr', {'optimizer': 'sgd', 'lr': 1.0, 'momentum': 0.0, 'grad_clip': 1.0}),
        ('M3_sgd_high_momentum', {'optimizer': 'sgd', 'lr': 3e-3, 'momentum': 0.99}),
    ]
    return [
        {
            'group': 'M',
            'variant': variant_name,
            'band': band,
            'seed': 0,
            'phase': 1,
            # Per-config copy so sibling configs never alias one dict.
            'overrides': dict(overrides),
            'description': f'M-{band}-{variant_name}',
        }
        for variant_name, overrides in variants
        for band in ['LOW', 'MID', 'HIGH']
    ]
def group_N_uniformity_diagnostic() -> List[Dict[str, Any]]:
    """Placeholder for the attractor-uniformity diagnostic (group N).

    Group N is NOT a standalone group.  It is a flag meaning every other
    variant's post-training analysis should include this diagnostic:

    1. Extract the final sphere-normed rows.
    2. Compute pentachoron CV at n_samples=2000.
    3. Compare to the uniform-sphere prediction for that D.
    4. Record observed_CV, uniform_CV, and deviation in
       final_report.json.

    Returns:
        Always an empty list -- there are no standalone configs to run.
    """
    no_standalone_configs: List[Dict[str, Any]] = []
    return no_standalone_configs
# ----------------------------------------------------------------------------
# Full matrix assembly
# ----------------------------------------------------------------------------
def get_phase1_configs() -> List[Dict[str, Any]]:
    """Assemble the Phase 1 matrix -- all band-classification ablations.

    Groups are concatenated in the recommended run order (most
    informative first):

    1. Group A (seed replication) -- foundational.
    2. Group G (sphere-norm) -- framework verification.
    3. Group E_preview (soft-hand 1000-batch preview).
    4. Groups B, C, D, F, I, J, K, L, M -- remaining ablations.

    Returns:
        A new list holding every Phase 1 config, in the order above.
    """
    builders = (
        group_A_seed_replication,     # 15 runs
        group_G_sphere_norm,          # 12 runs
        group_E_subset_phase1,        # 12 runs
        group_B_dataset_composition,  # 18 runs
        group_C_optimizer,            # 12 runs (4 variants x 3 bands)
        group_D_schedule,             # 15 runs
        group_F_activation,           # 15 runs
        group_I_cross_attention,      # 12 runs
        group_J_capacity_within_LOW,  # 5 runs
        group_K_batch_size,           # 12 runs
        group_L_initialization,       # 12 runs
        group_M_brute_force_sgd,      # 9 runs
    )
    matrix: List[Dict[str, Any]] = []
    for build_group in builders:
        matrix.extend(build_group())
    return matrix
def group_P_small_battery_floor() -> List[Dict[str, Any]]:
    """Build the small-battery floor sweep -- a PHASE 2 variant with a tiny batch budget.

    Grid-sweeps architecture at the H2_linear_matched baseline to find
    the smallest battery that still reconstructs gaussian within a
    reasonable multiplier of the h2-64 floor AND lands in a valid
    geometric attractor (CV in MID/HIGH range).

    Grid axes (size):
        hidden:    {4, 8, 16, 32, 64}   (5)
        V:         {2, 4, 8, 16, 32}    (5)
        D:         {2, 3, 4}            (3)
        depth:     {0, 1}               (2)
        n_cross:   {0, 1}               (2)
        optimizer: {'adam', 'lbfgs'}    (2)
    Full product: 5 x 5 x 3 x 2 x 2 x 2 = 600 runs.

    Pins (H2_linear_matched baseline):
        svd='none', linear_readout=True, match_params=True
        band='HIGH' (patch_size=4, img_size=64)
        batch_size=256
        batch_limit=20 (5120 samples seen -- matches floor-sweep budget)

    NOTE: smooth_mid is NOT varied here -- PatchSVAE_F_Ablation doesn't
    expose it as a parameter.  All configs use the PatchSVAE_F_Ablation
    default BoundarySmooth.  If smooth_mid variation is needed later,
    plumb it through the model class and add it as a grid axis.

    LIMITATION: cv_of() returns 0 for V<5 (pentachoron volume needs >=5
    points).  Configs with V in {2, 4} will have observed_sphere_cv=0,
    cv_ema=0, and predicted_band='LOW'.  This is an architectural
    constraint of the geometric validity metric, not a training failure.
    Use test_mse_per_noise[0] and train_loss_trajectory as the primary
    quality metrics for those configs; CV-based analysis applies only to
    V>=8 configs.

    Records via run_ablation_config's full report: CV_ema, cv_last, S0,
    SD, ratio, erank, observed_sphere_cv, band_deviation, predicted_band,
    band_match, params_finite, cv_trajectory, train_loss_trajectory,
    test_mse, test_mse_per_noise, plus per-config wallclock and
    batches_completed.

    Returns:
        600 configs in grid order: hidden outermost, then V, D, depth,
        n_cross, with optimizer varying fastest.
    """

    def build_config(hidden, V, D, depth, n_cross, optimizer):
        # One grid point -> one config dict; every nested container is
        # created fresh for that config.
        variant_name = f"P_h{hidden}_V{V}_D{D}_dp{depth}_nx{n_cross}_{optimizer}"
        # Per-optimizer LR tuned for the 20-step budget: Adam at 1e-4
        # (the Phase-2 default) barely moves in 20 steps on small models.
        # LBFGS's line search handles its own step sizing; 1.0 is the
        # library default for a unit Wolfe step.
        if optimizer == 'adam':
            lr = 3e-3
        else:
            lr = 1.0
        overrides = {
            # H2_linear_matched baseline
            'svd': 'none',
            'linear_readout': True,
            'match_params': True,
            # Size axes
            'hidden': hidden,
            'V': V,
            'D': D,
            'depth': depth,
            'n_cross': n_cross,
            # Pin n_heads=1: D varies over {2, 3, 4} and the default
            # n_heads=4 would fail for D=2 and D=3.
            'n_heads': 1,
            # Optimizer + LR tuned for the short budget
            'optimizer': optimizer,
            'lr': lr,
            # Gradient clipping catches LBFGS explosions (both
            # initial-step Wolfe failures on tiny params and mid-training
            # Hessian-approximation corruption on depth=1 + n_cross=1
            # configs).  Standard defensive practice for small-model
            # sweeps; no cost when not triggered.
            'grad_clip': 1.0,
            # Measure CV every 2 batches (was 50 -- too coarse for a
            # 20-batch sweep).
            'cv_measure_every': 2,
            # Pure MSE, no soft-hand (per 000079 -- LBFGS Hessian
            # corruption avoidance)
            'soft_hand': False,
            # Training: gaussian only (for floor detection)
            'noise_types': [0],
            # Testing: all 16 noises, 256 each.  Kept separate from the
            # training distribution so per-noise generalization is
            # measured.
            'test_noise_types': list(range(16)),
            'test_samples_per_noise': 256,
            'test_batch_size': 64,
        }
        return {
            'group': 'P',
            'variant': variant_name,
            'band': 'HIGH',
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 20,
            'overrides': overrides,
            'description': f'P-HIGH-{variant_name} (floor sweep, 20-batch budget)',
        }

    return [
        build_config(hidden, V, D, depth, n_cross, optimizer)
        for hidden in [4, 8, 16, 32, 64]
        for V in [2, 4, 8, 16, 32]
        for D in [2, 3, 4]
        for depth in [0, 1]
        for n_cross in [0, 1]
        for optimizer in ['adam', 'lbfgs']
    ]
def group_implicit_solver_A_d5_spherical() -> List[Dict[str, Any]]:
    """Build the implicit-solver A-set: D=5 spherical reference batteries.

    Three configs test the projective-axis hypothesis at D=5:

    - A3a: V=16, D=5 -- minimal V, may force more antipodal collapses.
    - A3b: V=32, D=5 -- direct comparator to H2a (V=32, D=4).
    - A3c: V=64, D=5 -- extra V room, may reduce antipodal pair count.

    All configs match the Q-rank02 (H2a) baseline:

    - H2_linear_matched: svd=none, linear_readout=True, match_params=True.
    - Adam at lr=3e-3, depth=0, n_cross=0, n_heads=1.
    - 1000 batches, gaussian-only training.
    - Per-noise test on all 16 noise types.

    Predicted (if 000101 generalizes to D=5):

    - All three converge with finite MSE.
    - All three show a projective-uniform distribution on RP^4.
    - Axis count grows with V; antipodal pair count grows with V/D.
    - Effective rank stays near full (~4.95/5).

    A3b is the critical test (it matches the H2a config except that D is
    bumped to 5).

    Returns:
        Three Phase 2 configs in the order A3a, A3b, A3c.
    """
    reference_batteries = [
        # (V, D, label)
        (16, 5, 'A3a_V16_D5'),
        (32, 5, 'A3b_V32_D5'),
        (64, 5, 'A3c_V64_D5'),
    ]
    built = []
    for V, D, label in reference_batteries:
        name = f"{label}_h64_dp0_nx0_adam"
        overrides = {
            'svd': 'none',
            'linear_readout': True,
            'match_params': True,
            'hidden': 64,
            'V': V,
            'D': D,
            'depth': 0,
            'n_cross': 0,
            'n_heads': 1,
            'optimizer': 'adam',
            'lr': 3e-3,
            'grad_clip': 1.0,
            'cv_measure_every': 50,
            'soft_hand': False,
            'noise_types': [0],
            'test_noise_types': list(range(16)),
            'test_samples_per_noise': 256,
            'test_batch_size': 64,
        }
        entry = {
            'group': 'implicit_solver_A',
            'variant': name,
            'band': 'HIGH',  # nominally HIGH -- D=5 is a new regime
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 1000,
            'overrides': overrides,
            'description': (
                f'implicit_solver_A-{name} '
                f'(D=5 spherical reference, projective probe target)'
            ),
        }
        built.append(entry)
    return built
def get_implicit_solver_A_configs() -> List[Dict[str, Any]]:
    """Return the implicit-solver A-set, Stage 1: the D=5 spherical references.

    Thin entry point over group_implicit_solver_A_d5_spherical().
    """
    stage1_configs = group_implicit_solver_A_d5_spherical()
    return stage1_configs
def group_R_packed_polytope_test() -> List[Dict[str, Any]]:
    """Build the sphere-packing prediction test: does V x D matter geometrically?

    Hypothesis (from G-Class probe v3): the 32-row x D=3 G-Class behavior
    (rotating antipodal frame) emerged because 32 points cannot be
    uniformly arranged on S^2 -- geometric frustration.  When V matches a
    natural polytope vertex count for S^(D-1), training should produce
    STATIC sphere-solver rows instead.

    Three test configs (each predicted to produce H2-LIKE static rows):

    - D=4, V=16: 16-cell (4-orthoplex) vertex count on S^3.
    - D=4, V=8: 16-cell again (8 vertices = 4D cross-polytope subset) or
      8-cell (tesseract) -- 8 is canonical for both.
    - D=3, V=20: dodecahedron vertex count on S^2.

    NOTE(review): the 16-cell (4-orthoplex) has 8 vertices and the 8-cell
    (tesseract) has 16, so the polytope names attached to V=16 and V=8
    look swapped.  The labels are left untouched because they are part
    of the variant identifiers -- confirm before relying on them.

    All else matches H2a (Q-rank02): adam, lr=3e-3, depth=0, n_cross=0,
    H2_linear_matched (svd=none, linear_readout=True, match_params=True),
    1000 batches, gaussian-only training, 16-noise per-noise test.

    Predicted result: all three produce row_stability > 0.85 and
    antipodal pair fraction < 0.55 -- i.e. H2-LIKE character on the v3
    probe.

    Returns:
        Three Phase 2 configs, in the order listed above.
    """
    polytope_grid = [
        # (V, D, polytope_name)
        (16, 4, '16cell_orthoplex'),
        (8, 4, '8cell_or_16cell_subset'),
        (20, 3, 'dodecahedron'),
    ]
    built = []
    for V, D, polytope in polytope_grid:
        name = f"R_h64_V{V}_D{D}_{polytope}_adam"
        overrides = {
            'svd': 'none',
            'linear_readout': True,
            'match_params': True,
            'hidden': 64,
            'V': V,
            'D': D,
            'depth': 0,
            'n_cross': 0,
            'n_heads': 1,
            'optimizer': 'adam',
            'lr': 3e-3,
            'grad_clip': 1.0,
            'cv_measure_every': 50,
            'soft_hand': False,
            'noise_types': [0],
            'test_noise_types': list(range(16)),
            'test_samples_per_noise': 256,
            'test_batch_size': 64,
        }
        built.append({
            'group': 'R',
            'variant': name,
            'band': 'HIGH',
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 1000,
            'overrides': overrides,
            'description': f'R-HIGH-{name} (packing test, predicted H2-LIKE)',
        })
    return built
def get_phaseR_configs() -> List[Dict[str, Any]]:
    """Return the Phase R matrix: the sphere-packing prediction test (3 configs).

    Thin entry point over group_R_packed_polytope_test().
    """
    phase_r_configs = group_R_packed_polytope_test()
    return phase_r_configs
def group_Q_h2_candidates() -> List[Dict[str, Any]]:
    """Build the top-10 P-sweep winners, extended to 1000 batches.

    These are the 10 configs flagged by the P-sweep analyzer's
    continued-training-potential ranking.  Each is re-run with the same
    architecture and optimizer but with batch_limit=1000 (50x the P
    sweep's 20-batch budget).

    Purpose -- answer the classification questions the P sweep couldn't:

    - What is the actual convergence floor per config?
    - Does Adam catch LBFGS with enough budget?  (The table below holds
      7 Adam and 3 LBFGS entries.)
    - Where does the loss trajectory flatten?
    - Does the discrimination ratio sharpen with more training?
    - Does final CV land in the valid band (0.13-0.30)?

    Results feed into H2 class-rank assignment.

    cv_measure_every=50 gives ~20 CV measurements across the run.  The P
    sweep's setting of 2 would be 500 measurements at 1000 batches --
    too many.

    Returns:
        Ten Phase 2 configs in rank order; the rank is embedded in each
        variant name as a zero-padded two-digit number.
    """
    # Top 10 from the P-sweep analyzer (ranked by
    # continued_training_potential).  Trailing notes are the original
    # author's parameter counts and P-sweep MSE.
    ranked_winners = [
        # (hidden, V, D, depth, n_cross, optimizer)
        (64, 32, 4, 1, 0, 'lbfgs'),  # 1 -- 57123 params, P-MSE 0.053
        (64, 32, 4, 0, 0, 'adam'),   # 2 -- 40227 params, P-MSE 0.572
        (64, 32, 4, 0, 1, 'adam'),   # 3 -- 40319 params, P-MSE 0.584
        (64, 32, 4, 0, 1, 'lbfgs'),  # 4 -- 40319 params, P-MSE 0.041
        (64, 16, 4, 1, 1, 'lbfgs'),  # 5 -- 36607 params, P-MSE 0.115
        (64, 32, 3, 1, 1, 'adam'),   # 6 -- 45852 params, P-MSE 0.656
        (64, 32, 3, 0, 1, 'adam'),   # 7 -- 28956 params, P-MSE 0.641
        (64, 32, 4, 1, 1, 'adam'),   # 8 -- 57215 params, P-MSE 0.620
        (64, 32, 3, 0, 0, 'adam'),   # 9 -- 28899 params, P-MSE 0.638
        (64, 32, 2, 0, 1, 'adam'),   # 10 -- 19649 params, P-MSE 0.736
    ]

    def build_config(rank, hidden, V, D, depth, n_cross, optimizer):
        # One ranked winner -> one extended-budget config.
        name = f"Q_rank{rank:02d}_h{hidden}_V{V}_D{D}_dp{depth}_nx{n_cross}_{optimizer}"
        overrides = {
            # H2_linear_matched baseline
            'svd': 'none',
            'linear_readout': True,
            'match_params': True,
            # Size axes (from the P winner)
            'hidden': hidden,
            'V': V,
            'D': D,
            'depth': depth,
            'n_cross': n_cross,
            'n_heads': 1,
            # Optimizer -- same LR as the P sweep: Adam 3e-3, LBFGS 1.0
            'optimizer': optimizer,
            'lr': 3e-3 if optimizer == 'adam' else 1.0,
            'grad_clip': 1.0,
            # CV measurement: every 50 batches gives ~20 measurements
            # across the 1000-batch run.
            'cv_measure_every': 50,
            # Pure MSE, no soft-hand
            'soft_hand': False,
            # Training: gaussian only (matches the P sweep)
            'noise_types': [0],
            # Full 16-noise test at the end
            'test_noise_types': list(range(16)),
            'test_samples_per_noise': 256,
            'test_batch_size': 64,
        }
        return {
            'group': 'Q',
            'variant': name,
            'band': 'HIGH',
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 1000,  # 50x the P sweep
            'overrides': overrides,
            'description': f'Q-HIGH-{name} (H2 candidate extended sweep, 1000 batches)',
        }

    return [
        build_config(rank, *winner)
        for rank, winner in enumerate(ranked_winners, start=1)
    ]
def get_phaseQ_configs() -> List[Dict[str, Any]]:
    """Return the Phase Q matrix: top-10 P winners at 1000 batches.

    Used for H2 class-rank assignment.  Thin entry point over
    group_Q_h2_candidates().
    """
    phase_q_configs = group_Q_h2_candidates()
    return phase_q_configs
def get_phaseP_configs() -> List[Dict[str, Any]]:
    """Return the Phase P (floor sweep) matrix: 600 configs at 20 batches each.

    Thin entry point over group_P_small_battery_floor().
    """
    floor_sweep_configs = group_P_small_battery_floor()
    return floor_sweep_configs
def get_phase2_configs() -> List[Dict[str, Any]]:
    """Assemble the Phase 2 matrix -- 1 epoch per config, resume-capable.

    Revised from the original 174-config design after Phase 1 settled
    the "does the attractor survive" question.  Phase 2 now characterizes
    WITHIN-ATTRACTOR behavior over one full epoch (~3900 batches at
    batch_size=256):

    - Group E (36 runs): within-attractor MSE under each soft-hand regime.
    - Group H (42 runs): SVD necessity (vs learned linear readout).
    - Group L2 (6 runs): LBFGS within-attractor MSE characterization
      (MID + HIGH only; LOW omitted pending the RLBFGS engineering pass
      -- see the group_L2_lbfgs docstring).

    Total: 84 runs.  Intriguing cases can be continued to epoch 3 or 5
    using the orchestrator's continue_training() function.

    Returns:
        A new list holding groups E, H and L2, in that order.
    """
    matrix: List[Dict[str, Any]] = []
    matrix.extend(group_E_soft_hand())      # 36 runs
    matrix.extend(group_H_svd_necessity())  # 42 runs
    matrix.extend(group_L2_lbfgs())         # 6 runs
    return matrix
def summarize(configs: List[Dict[str, Any]]) -> None:
    """Print a sanity-check breakdown of a config matrix.

    Writes the total number of configs to stdout, followed by per-group,
    per-band and per-phase counts, each section sorted by key.

    Args:
        configs: Config dicts; each must provide 'group', 'band' and
            'phase' keys.

    Raises:
        KeyError: If any config lacks one of the three keys.
    """
    tallies: Dict[str, Dict[Any, int]] = {'group': {}, 'band': {}, 'phase': {}}
    for cfg in configs:
        for field_name, counts in tallies.items():
            value = cfg[field_name]
            counts[value] = counts.get(value, 0) + 1

    print(f"Total configs: {len(configs)}")

    print(f"\nBy group:")
    for group_name, count in sorted(tallies['group'].items()):
        print(f" {group_name}: {count}")

    print(f"\nBy band:")
    for band_name, count in sorted(tallies['band'].items()):
        print(f" {band_name}: {count}")

    print(f"\nBy phase:")
    for phase_id, count in sorted(tallies['phase'].items()):
        print(f" Phase {phase_id}: {count}")
if __name__ == '__main__':
    # Script entry point: print a banner and a summary for the Phase 1 and
    # Phase 2 matrices, separated by one blank line.
    banner = "=" * 60
    sections = [
        ("PHASE 1 MATRIX", get_phase1_configs),
        ("PHASE 2 MATRIX", get_phase2_configs),
    ]
    for position, (title, build_matrix) in enumerate(sections):
        if position > 0:
            print()
        print(banner)
        print(title)
        print(banner)
        summarize(build_matrix())