| { |
| "table3_reference": { |
| "LLM": { |
| "n": 3138, |
| "max_mAP": 0.727, |
| "space": "expanded" |
| }, |
| "Random_exp": { |
| "n": 394, |
| "max_mAP": 0.737, |
| "space": "expanded" |
| }, |
| "TPE_exp": { |
| "n": 394, |
| "max_mAP": 0.694, |
| "space": "expanded" |
| }, |
| "SMAC_exp": { |
| "n": 179, |
| "max_mAP": 0.674, |
| "space": "expanded" |
| }, |
| "Random_core": { |
| "n": 619, |
| "max_mAP": 0.702, |
| "space": "core" |
| }, |
| "TPE_core": { |
| "n": 621, |
| "max_mAP": 0.696, |
| "space": "core" |
| }, |
| "BOHB_core": { |
| "n": 512, |
| "max_mAP": 0.702, |
| "space": "core" |
| } |
| }, |
| "expected_max_correction": { |
| "LLM": { |
| "n": 3138, |
| "observed_max": 0.727, |
| "null_expected_max": 0.7332, |
| "null_std_max": 0.005, |
| "null_p_value": 0.9035, |
| "gumbel_expected_max": 0.8158, |
| "excess_over_null": -0.0062, |
| "z_score": -1.26 |
| }, |
| "Random_exp": { |
| "n": 394, |
| "observed_max": 0.737, |
| "null_expected_max": 0.7162, |
| "null_std_max": 0.0127, |
| "null_p_value": 0.0913, |
| "gumbel_expected_max": 0.7848, |
| "excess_over_null": 0.0208, |
| "z_score": 1.64 |
| }, |
| "TPE_exp": { |
| "n": 394, |
| "observed_max": 0.694, |
| "null_expected_max": 0.7163, |
| "null_std_max": 0.0127, |
| "null_p_value": 0.9687, |
| "gumbel_expected_max": 0.7848, |
| "excess_over_null": -0.0223, |
| "z_score": -1.75 |
| }, |
| "SMAC_exp": { |
| "n": 179, |
| "observed_max": 0.674, |
| "null_expected_max": 0.7064, |
| "null_std_max": 0.0149, |
| "null_p_value": 0.994, |
| "gumbel_expected_max": 0.7715, |
| "excess_over_null": -0.0324, |
| "z_score": -2.17 |
| }, |
| "Random_core": { |
| "n": 619, |
| "observed_max": 0.702, |
| "null_expected_max": 0.7211, |
| "null_std_max": 0.0111, |
| "null_p_value": 0.9605, |
| "gumbel_expected_max": 0.7919, |
| "excess_over_null": -0.0191, |
| "z_score": -1.72 |
| }, |
| "TPE_core": { |
| "n": 621, |
| "observed_max": 0.696, |
| "null_expected_max": 0.7209, |
| "null_std_max": 0.0112, |
| "null_p_value": 0.9938, |
| "gumbel_expected_max": 0.792, |
| "excess_over_null": -0.0249, |
| "z_score": -2.23 |
| }, |
| "BOHB_core": { |
| "n": 512, |
| "observed_max": 0.702, |
| "null_expected_max": 0.7189, |
| "null_std_max": 0.012, |
| "null_p_value": 0.9243, |
| "gumbel_expected_max": 0.7889, |
| "excess_over_null": -0.0169, |
| "z_score": -1.41 |
| } |
| }, |
| "topk_enrichment": { |
| "top_10": { |
| "k": 10, |
| "by_policy": { |
| "LLM": 0.4, |
| "Random_exp": 0.3, |
| "BOHB_core": 0.2, |
| "Random_core": 0.1 |
| }, |
| "by_space": { |
| "expanded": 0.7, |
| "core": 0.3 |
| }, |
| "threshold_mAP": 0.6969 |
| }, |
| "top_25": { |
| "k": 25, |
| "by_policy": { |
| "Random_exp": 0.28, |
| "LLM": 0.24, |
| "BOHB_core": 0.24, |
| "TPE_core": 0.12, |
| "Random_core": 0.08, |
| "TPE_exp": 0.04 |
| }, |
| "by_space": { |
| "expanded": 0.56, |
| "core": 0.44 |
| }, |
| "threshold_mAP": 0.6895 |
| }, |
| "top_50": { |
| "k": 50, |
| "by_policy": { |
| "Random_exp": 0.26, |
| "LLM": 0.24, |
| "BOHB_core": 0.24, |
| "TPE_core": 0.18, |
| "Random_core": 0.06, |
| "TPE_exp": 0.02 |
| }, |
| "by_space": { |
| "expanded": 0.52, |
| "core": 0.48 |
| }, |
| "threshold_mAP": 0.6758 |
| }, |
| "top_100": { |
| "k": 100, |
| "by_policy": { |
| "LLM": 0.27, |
| "Random_exp": 0.21, |
| "BOHB_core": 0.2, |
| "TPE_core": 0.15, |
| "Random_core": 0.13, |
| "TPE_exp": 0.03, |
| "SMAC_exp": 0.01 |
| }, |
| "by_space": { |
| "expanded": 0.52, |
| "core": 0.48 |
| }, |
| "threshold_mAP": 0.6681 |
| } |
| }, |
| "sample_matched_comparison": { |
| "n=179 (vs SMAC)": { |
| "target_n": 179, |
| "llm_subsample_mean_max": 0.6809, |
| "llm_subsample_std_max": 0.0175, |
| "llm_subsample_p5": 0.6591, |
| "llm_subsample_p50": 0.6769, |
| "llm_subsample_p95": 0.727 |
| }, |
| "n=394 (vs Random)": { |
| "target_n": 394, |
| "llm_subsample_mean_max": 0.6922, |
| "llm_subsample_std_max": 0.0177, |
| "llm_subsample_p5": 0.6693, |
| "llm_subsample_p50": 0.6909, |
| "llm_subsample_p95": 0.727 |
| }, |
| "n=512 (vs BOHB)": { |
| "target_n": 512, |
| "llm_subsample_mean_max": 0.6964, |
| "llm_subsample_std_max": 0.0173, |
| "llm_subsample_p5": 0.6716, |
| "llm_subsample_p50": 0.6969, |
| "llm_subsample_p95": 0.727 |
| }, |
| "n=619 (vs Rand_core)": { |
| "target_n": 619, |
| "llm_subsample_mean_max": 0.6994, |
| "llm_subsample_std_max": 0.0167, |
| "llm_subsample_p5": 0.673, |
| "llm_subsample_p50": 0.6969, |
| "llm_subsample_p95": 0.727 |
| }, |
| "llm_394_vs_random_exp": { |
| "random_exp_max": 0.737, |
| "llm_mean_max_at_394": 0.6922, |
| "p_llm_exceeds_random": 0.0 |
| }, |
| "smac_bootstrap_max": { |
| "observed_max": 0.674, |
| "bootstrap_mean_max": 0.6698, |
| "bootstrap_std_max": 0.0042, |
| "bootstrap_95ci": [ |
| 0.663, |
| 0.6743 |
| ] |
| } |
| }, |
| "analytical_expected_max": { |
| "LLM": { |
| "n": 3138, |
| "gumbel_factor": 3.4375, |
| "relative_advantage_vs_394": 0.6049 |
| }, |
| "Random_exp": { |
| "n": 394, |
| "gumbel_factor": 2.8327, |
| "relative_advantage_vs_394": 0.0 |
| }, |
| "TPE_exp": { |
| "n": 394, |
| "gumbel_factor": 2.8327, |
| "relative_advantage_vs_394": 0.0 |
| }, |
| "SMAC_exp": { |
| "n": 179, |
| "gumbel_factor": 2.5725, |
| "relative_advantage_vs_394": -0.2601 |
| }, |
| "Random_core": { |
| "n": 619, |
| "gumbel_factor": 2.9731, |
| "relative_advantage_vs_394": 0.1405 |
| }, |
| "TPE_core": { |
| "n": 621, |
| "gumbel_factor": 2.9741, |
| "relative_advantage_vs_394": 0.1415 |
| }, |
| "BOHB_core": { |
| "n": 512, |
| "gumbel_factor": 2.9148, |
| "relative_advantage_vs_394": 0.0821 |
| }, |
| "sigma_sensitivity": { |
| "advantage_in_sigma": 0.6049, |
| "mAP_advantage_by_sigma": { |
| "0.02": 0.0121, |
| "0.03": 0.0181, |
| "0.05": 0.0302, |
| "0.07": 0.0423 |
| } |
| }, |
| "observed_gap": -0.01, |
| "interpretation": "LLM max (0.727) < Random_exp max (0.737) despite 8x more samples. Under any expected-max correction, larger n should produce HIGHER max, so the LLM's shortfall is even more pronounced after correction. Random_exp's 0.737 from only 394 samples implies a genuinely better right tail in the expanded space (non-VJepa2 configs)." |
| }, |
| "distribution_quality": { |
| "LLM": { |
| "n": 3136, |
| "mean": 0.582, |
| "std": 0.0534, |
| "median": 0.5931, |
| "p25": 0.5622, |
| "p75": 0.6175, |
| "p90": 0.6373, |
| "p95": 0.6484, |
| "max": 0.727, |
| "frac_above_0.65": 0.044, |
| "frac_above_0.68": 0.0045, |
| "frac_above_0.7": 0.0013, |
| "frac_above_0.72": 0.0006 |
| }, |
| "Random_exp": { |
| "n": 394, |
| "mean": 0.5842, |
| "std": 0.0533, |
| "median": 0.588, |
| "p25": 0.5467, |
| "p75": 0.6221, |
| "p90": 0.6483, |
| "p95": 0.6608, |
| "max": 0.737, |
| "frac_above_0.65": 0.0939, |
| "frac_above_0.68": 0.0228, |
| "frac_above_0.7": 0.0152, |
| "frac_above_0.72": 0.0051 |
| }, |
| "TPE_exp": { |
| "n": 394, |
| "mean": 0.5618, |
| "std": 0.0516, |
| "median": 0.5619, |
| "p25": 0.5291, |
| "p75": 0.595, |
| "p90": 0.6248, |
| "p95": 0.6494, |
| "max": 0.6935, |
| "frac_above_0.65": 0.0508, |
| "frac_above_0.68": 0.0051, |
| "frac_above_0.7": 0.0, |
| "frac_above_0.72": 0.0 |
| }, |
| "SMAC_exp": { |
| "n": 179, |
| "mean": 0.6315, |
| "std": 0.0196, |
| "median": 0.6345, |
| "p25": 0.6216, |
| "p75": 0.6423, |
| "p90": 0.6557, |
| "p95": 0.6596, |
| "max": 0.6743, |
| "frac_above_0.65": 0.1676, |
| "frac_above_0.68": 0.0, |
| "frac_above_0.7": 0.0, |
| "frac_above_0.72": 0.0 |
| } |
| }, |
| "reconstruction_sensitivity": { |
| "std_mult_0.5": { |
| "multiplier": 0.5, |
| "mean_max": 0.7196, |
| "std_max": 0.0049, |
| "p_exceed_random": 0.0 |
| }, |
| "std_mult_0.75": { |
| "multiplier": 0.75, |
| "mean_max": 0.7186, |
| "std_max": 0.0065, |
| "p_exceed_random": 0.0 |
| }, |
| "std_mult_1.0": { |
| "multiplier": 1.0, |
| "mean_max": 0.7158, |
| "std_max": 0.0075, |
| "p_exceed_random": 0.0 |
| }, |
| "std_mult_1.5": { |
| "multiplier": 1.5, |
| "mean_max": 0.7023, |
| "std_max": 0.0138, |
| "p_exceed_random": 0.0 |
| }, |
| "std_mult_2.0": { |
| "multiplier": 2.0, |
| "mean_max": 0.6616, |
| "std_max": 0.0295, |
| "p_exceed_random": 0.0 |
| }, |
| "assumption_free_findings": { |
| "order_statistics": "LLM (n=3138) should have ~0.6 sigma Gumbel advantage over Random (n=394). For any reasonable sigma, LLM should exceed Random. It does not.", |
| "smac_real_data": "SMAC (n=179, real data): max=0.674, mean=0.632, std=0.020. Tight distribution with low ceiling, confirming BO struggles with the saturated proxy.", |
| "raw_observation": "LLM max (0.727) < Random_exp max (0.737) with 8x more samples. This is a model-free observation that needs no reconstruction." |
| } |
| }, |
| "summary": { |
| "key_finding": "Oracle-selection bias works AGAINST the LLM: with 8x more samples, it should achieve a higher max than Random_exp, but it does not (0.727 < 0.737). After expected-max correction, the LLM's shortfall is ~0.025 mAP, confirming that the expanded space quality (SSC) rather than optimizer effectiveness (WSO) determines the ceiling.", |
| "oracle_bias_direction": "favors_null_against_llm", |
| "corrected_comparison": "Even with order-statistics correction for n=3138 vs n=394, Random_exp outperforms LLM, confirming the space-quality thesis." |
| } |
| } |