{
  "table3_reference": {
    "LLM": {
      "n": 3138,
      "max_mAP": 0.727,
      "space": "expanded"
    },
    "Random_exp": {
      "n": 394,
      "max_mAP": 0.737,
      "space": "expanded"
    },
    "TPE_exp": {
      "n": 394,
      "max_mAP": 0.694,
      "space": "expanded"
    },
    "SMAC_exp": {
      "n": 179,
      "max_mAP": 0.674,
      "space": "expanded"
    },
    "Random_core": {
      "n": 619,
      "max_mAP": 0.702,
      "space": "core"
    },
    "TPE_core": {
      "n": 621,
      "max_mAP": 0.696,
      "space": "core"
    },
    "BOHB_core": {
      "n": 512,
      "max_mAP": 0.702,
      "space": "core"
    }
  },
  "expected_max_correction": {
    "LLM": {
      "n": 3138,
      "observed_max": 0.727,
      "null_expected_max": 0.7332,
      "null_std_max": 0.005,
      "null_p_value": 0.9035,
      "gumbel_expected_max": 0.8158,
      "excess_over_null": -0.0062,
      "z_score": -1.26
    },
    "Random_exp": {
      "n": 394,
      "observed_max": 0.737,
      "null_expected_max": 0.7162,
      "null_std_max": 0.0127,
      "null_p_value": 0.0913,
      "gumbel_expected_max": 0.7848,
      "excess_over_null": 0.0208,
      "z_score": 1.64
    },
    "TPE_exp": {
      "n": 394,
      "observed_max": 0.694,
      "null_expected_max": 0.7163,
      "null_std_max": 0.0127,
      "null_p_value": 0.9687,
      "gumbel_expected_max": 0.7848,
      "excess_over_null": -0.0223,
      "z_score": -1.75
    },
    "SMAC_exp": {
      "n": 179,
      "observed_max": 0.674,
      "null_expected_max": 0.7064,
      "null_std_max": 0.0149,
      "null_p_value": 0.994,
      "gumbel_expected_max": 0.7715,
      "excess_over_null": -0.0324,
      "z_score": -2.17
    },
    "Random_core": {
      "n": 619,
      "observed_max": 0.702,
      "null_expected_max": 0.7211,
      "null_std_max": 0.0111,
      "null_p_value": 0.9605,
      "gumbel_expected_max": 0.7919,
      "excess_over_null": -0.0191,
      "z_score": -1.72
    },
    "TPE_core": {
      "n": 621,
      "observed_max": 0.696,
      "null_expected_max": 0.7209,
      "null_std_max": 0.0112,
      "null_p_value": 0.9938,
      "gumbel_expected_max": 0.792,
      "excess_over_null": -0.0249,
      "z_score": -2.23
    },
    "BOHB_core": {
      "n": 512,
      "observed_max": 0.702,
      "null_expected_max": 0.7189,
      "null_std_max": 0.012,
      "null_p_value": 0.9243,
      "gumbel_expected_max": 0.7889,
      "excess_over_null": -0.0169,
      "z_score": -1.41
    }
  },
  "topk_enrichment": {
    "top_10": {
      "k": 10,
      "by_policy": {
        "LLM": 0.4,
        "Random_exp": 0.3,
        "BOHB_core": 0.2,
        "Random_core": 0.1
      },
      "by_space": {
        "expanded": 0.7,
        "core": 0.3
      },
      "threshold_mAP": 0.6969
    },
    "top_25": {
      "k": 25,
      "by_policy": {
        "Random_exp": 0.28,
        "LLM": 0.24,
        "BOHB_core": 0.24,
        "TPE_core": 0.12,
        "Random_core": 0.08,
        "TPE_exp": 0.04
      },
      "by_space": {
        "expanded": 0.56,
        "core": 0.44
      },
      "threshold_mAP": 0.6895
    },
    "top_50": {
      "k": 50,
      "by_policy": {
        "Random_exp": 0.26,
        "LLM": 0.24,
        "BOHB_core": 0.24,
        "TPE_core": 0.18,
        "Random_core": 0.06,
        "TPE_exp": 0.02
      },
      "by_space": {
        "expanded": 0.52,
        "core": 0.48
      },
      "threshold_mAP": 0.6758
    },
    "top_100": {
      "k": 100,
      "by_policy": {
        "LLM": 0.27,
        "Random_exp": 0.21,
        "BOHB_core": 0.2,
        "TPE_core": 0.15,
        "Random_core": 0.13,
        "TPE_exp": 0.03,
        "SMAC_exp": 0.01
      },
      "by_space": {
        "expanded": 0.52,
        "core": 0.48
      },
      "threshold_mAP": 0.6681
    }
  },
  "sample_matched_comparison": {
    "n=179 (vs SMAC)": {
      "target_n": 179,
      "llm_subsample_mean_max": 0.6809,
      "llm_subsample_std_max": 0.0175,
      "llm_subsample_p5": 0.6591,
      "llm_subsample_p50": 0.6769,
      "llm_subsample_p95": 0.727
    },
    "n=394 (vs Random)": {
      "target_n": 394,
      "llm_subsample_mean_max": 0.6922,
      "llm_subsample_std_max": 0.0177,
      "llm_subsample_p5": 0.6693,
      "llm_subsample_p50": 0.6909,
      "llm_subsample_p95": 0.727
    },
    "n=512 (vs BOHB)": {
      "target_n": 512,
      "llm_subsample_mean_max": 0.6964,
      "llm_subsample_std_max": 0.0173,
      "llm_subsample_p5": 0.6716,
      "llm_subsample_p50": 0.6969,
      "llm_subsample_p95": 0.727
    },
    "n=619 (vs Rand_core)": {
      "target_n": 619,
      "llm_subsample_mean_max": 0.6994,
      "llm_subsample_std_max": 0.0167,
      "llm_subsample_p5": 0.673,
      "llm_subsample_p50": 0.6969,
      "llm_subsample_p95": 0.727
    },
    "llm_394_vs_random_exp": {
      "random_exp_max": 0.737,
      "llm_mean_max_at_394": 0.6922,
      "p_llm_exceeds_random": 0.0
    },
    "smac_bootstrap_max": {
      "observed_max": 0.674,
      "bootstrap_mean_max": 0.6698,
      "bootstrap_std_max": 0.0042,
      "bootstrap_95ci": [
        0.663,
        0.6743
      ]
    }
  },
  "analytical_expected_max": {
    "LLM": {
      "n": 3138,
      "gumbel_factor": 3.4375,
      "relative_advantage_vs_394": 0.6049
    },
    "Random_exp": {
      "n": 394,
      "gumbel_factor": 2.8327,
      "relative_advantage_vs_394": 0.0
    },
    "TPE_exp": {
      "n": 394,
      "gumbel_factor": 2.8327,
      "relative_advantage_vs_394": 0.0
    },
    "SMAC_exp": {
      "n": 179,
      "gumbel_factor": 2.5725,
      "relative_advantage_vs_394": -0.2601
    },
    "Random_core": {
      "n": 619,
      "gumbel_factor": 2.9731,
      "relative_advantage_vs_394": 0.1405
    },
    "TPE_core": {
      "n": 621,
      "gumbel_factor": 2.9741,
      "relative_advantage_vs_394": 0.1415
    },
    "BOHB_core": {
      "n": 512,
      "gumbel_factor": 2.9148,
      "relative_advantage_vs_394": 0.0821
    },
    "sigma_sensitivity": {
      "advantage_in_sigma": 0.6049,
      "mAP_advantage_by_sigma": {
        "0.02": 0.0121,
        "0.03": 0.0181,
        "0.05": 0.0302,
        "0.07": 0.0423
      }
    },
    "observed_gap": -0.01,
    "interpretation": "LLM max (0.727) < Random_exp max (0.737) despite 8x more samples. Under any expected-max correction, larger n should produce HIGHER max, so the LLM's shortfall is even more pronounced after correction. Random_exp's 0.737 from only 394 samples implies a genuinely better right tail in the expanded space (non-VJepa2 configs)."
  },
  "distribution_quality": {
    "LLM": {
      "n": 3136,
      "mean": 0.582,
      "std": 0.0534,
      "median": 0.5931,
      "p25": 0.5622,
      "p75": 0.6175,
      "p90": 0.6373,
      "p95": 0.6484,
      "max": 0.727,
      "frac_above_0.65": 0.044,
      "frac_above_0.68": 0.0045,
      "frac_above_0.7": 0.0013,
      "frac_above_0.72": 0.0006
    },
    "Random_exp": {
      "n": 394,
      "mean": 0.5842,
      "std": 0.0533,
      "median": 0.588,
      "p25": 0.5467,
      "p75": 0.6221,
      "p90": 0.6483,
      "p95": 0.6608,
      "max": 0.737,
      "frac_above_0.65": 0.0939,
      "frac_above_0.68": 0.0228,
      "frac_above_0.7": 0.0152,
      "frac_above_0.72": 0.0051
    },
    "TPE_exp": {
      "n": 394,
      "mean": 0.5618,
      "std": 0.0516,
      "median": 0.5619,
      "p25": 0.5291,
      "p75": 0.595,
      "p90": 0.6248,
      "p95": 0.6494,
      "max": 0.6935,
      "frac_above_0.65": 0.0508,
      "frac_above_0.68": 0.0051,
      "frac_above_0.7": 0.0,
      "frac_above_0.72": 0.0
    },
    "SMAC_exp": {
      "n": 179,
      "mean": 0.6315,
      "std": 0.0196,
      "median": 0.6345,
      "p25": 0.6216,
      "p75": 0.6423,
      "p90": 0.6557,
      "p95": 0.6596,
      "max": 0.6743,
      "frac_above_0.65": 0.1676,
      "frac_above_0.68": 0.0,
      "frac_above_0.7": 0.0,
      "frac_above_0.72": 0.0
    }
  },
  "reconstruction_sensitivity": {
    "std_mult_0.5": {
      "multiplier": 0.5,
      "mean_max": 0.7196,
      "std_max": 0.0049,
      "p_exceed_random": 0.0
    },
    "std_mult_0.75": {
      "multiplier": 0.75,
      "mean_max": 0.7186,
      "std_max": 0.0065,
      "p_exceed_random": 0.0
    },
    "std_mult_1.0": {
      "multiplier": 1.0,
      "mean_max": 0.7158,
      "std_max": 0.0075,
      "p_exceed_random": 0.0
    },
    "std_mult_1.5": {
      "multiplier": 1.5,
      "mean_max": 0.7023,
      "std_max": 0.0138,
      "p_exceed_random": 0.0
    },
    "std_mult_2.0": {
      "multiplier": 2.0,
      "mean_max": 0.6616,
      "std_max": 0.0295,
      "p_exceed_random": 0.0
    },
    "assumption_free_findings": {
      "order_statistics": "LLM (n=3138) should have ~0.6 sigma Gumbel advantage over Random (n=394). For any reasonable sigma, LLM should exceed Random. It does not.",
      "smac_real_data": "SMAC (n=179, real data): max=0.674, mean=0.632, std=0.020. Tight distribution with low ceiling, confirming BO struggles with the saturated proxy.",
      "raw_observation": "LLM max (0.727) < Random_exp max (0.737) with 8x more samples. This is a model-free observation that needs no reconstruction."
    }
  },
  "summary": {
    "key_finding": "Oracle-selection bias works AGAINST the LLM: with 8x more samples, it should achieve a higher max than Random_exp, but it does not (0.727 < 0.737). After expected-max correction, the LLM's shortfall is ~0.025 mAP, confirming that the expanded space quality (SSC) rather than optimizer effectiveness (WSO) determines the ceiling.",
    "oracle_bias_direction": "favors_null_against_llm",
    "corrected_comparison": "Even with order-statistics correction for n=3138 vs n=394, Random_exp outperforms LLM, confirming the space-quality thesis."
  }
}