# (Web-page residue removed: "Spaces: Running", "File size: 6,232 Bytes", hash f0f3d44, line-number gutter 1-157.)
"""
Compute CLD (Compact Letter Display) letters and verify Beta posteriors
for all experiments using the TRI STEP sequential testing framework.
Parameters:
- global_confidence_level = 0.90 (α=0.10)
- n_max = 50
- shuffle = False (each rollout is independent)
- 11 experiments → C(11,2) = 55 pairwise comparisons → Bonferroni correction
"""
import json
import numpy as np
from scipy import stats
from sequentialized_barnard_tests.tools.plotting import compare_success_and_get_cld
# Rollout outcomes per experiment. Each entry is [successes, trials] for the
# aggregate ("total") and for the "L1" / "L2" levels.
# NOTE(review): the non-ASCII glyphs in the keys (π, ★) were restored from
# mis-decoded text — confirm they match the labels used by the HTML files.
EXPERIMENTS = {
    "1.1 π0": {"total": [8, 20], "L1": [8, 10], "L2": [0, 10]},
    "1.2 π0.5": {"total": [4, 20], "L1": [4, 10], "L2": [0, 10]},
    "1.3 Relative": {"total": [7, 20], "L1": [7, 10], "L2": [0, 10]},
    "1.4 RABC low": {"total": [3, 20], "L1": [3, 10], "L2": [0, 10]},
    "1.5 RABC high": {"total": [0, 20], "L1": [0, 10], "L2": [0, 10]},
    "1.7 Rel+RABC": {"total": [8, 20], "L1": [8, 10], "L2": [0, 10]},
    "2.1 HQ": {"total": [8, 20], "L1": [7, 10], "L2": [1, 10]},
    "2.2 HQ+RABC+Rel": {"total": [15, 20], "L1": [10, 10], "L2": [5, 10]},
    "2.3 HQ+mirror": {"total": [1, 20], "L1": [0, 10], "L2": [1, 10]},
    "2.4 HQ chunk45": {"total": [4, 20], "L1": [4, 10], "L2": [0, 10]},
    "2.5 HQ+RABC+Rel★": {"total": [18, 20], "L1": [10, 10], "L2": [8, 10]},
}

# What the HTML files use (percentages) — for cross-checking the round-trip.
# Keys must stay identical to EXPERIMENTS; level names are lower-case here.
HTML_RAW_PCT = {
    "1.1 π0": {"total": 40, "l1": 80, "l2": 0},
    "1.2 π0.5": {"total": 20, "l1": 40, "l2": 0},
    "1.3 Relative": {"total": 35, "l1": 70, "l2": 0},
    "1.4 RABC low": {"total": 15, "l1": 30, "l2": 0},
    "1.5 RABC high": {"total": 0, "l1": 0, "l2": 0},
    "1.7 Rel+RABC": {"total": 40, "l1": 80, "l2": 0},
    "2.1 HQ": {"total": 40, "l1": 70, "l2": 10},
    "2.2 HQ+RABC+Rel": {"total": 75, "l1": 100, "l2": 50},
    "2.3 HQ+mirror": {"total": 5, "l1": 0, "l2": 10},
    "2.4 HQ chunk45": {"total": 20, "l1": 40, "l2": 0},
    "2.5 HQ+RABC+Rel★": {"total": 90, "l1": 100, "l2": 80},
}
# Trial count behind each HTML percentage, keyed by the lower-case level names
# used in HTML_RAW_PCT.
HTML_N = dict(total=20, l1=10, l2=10)

# Settings passed to compare_success_and_get_cld.
GLOBAL_CONFIDENCE = 0.90
N_MAX = 50
SHUFFLE = False

# Experiment labels, in EXPERIMENTS insertion order.
model_names = [*EXPERIMENTS]
def draw_samples_from_beta_posterior(
    success_array: np.ndarray,
    rng: np.random.Generator,
    num_samples: int = 10000,
    alpha_prior: float = 1,
    beta_prior: float = 1,
) -> np.ndarray:
    """Draw samples from the Beta posterior of a success rate.

    Follows the helper from TRI's notebook: with a Beta(alpha_prior,
    beta_prior) prior and the observed outcomes, the posterior is
    Beta(alpha_prior + successes, beta_prior + failures).

    Args:
        success_array: Trial outcomes; truthy entries count as successes.
            Any array-like is accepted.
        rng: Generator used as the random state for sampling.
        num_samples: Number of posterior draws to return.
        alpha_prior: Alpha parameter of the Beta prior.
        beta_prior: Beta parameter of the Beta prior.

    Returns:
        Array of ``num_samples`` draws from the posterior.
    """
    outcomes = np.asarray(success_array)
    # Count trials over every element so the total agrees with np.sum() even
    # when the input is not 1-D (len() would only see the first axis).
    n_trials = outcomes.size
    n_successes = np.sum(outcomes)
    n_failures = n_trials - n_successes
    posterior = stats.beta(alpha_prior + n_successes, beta_prior + n_failures)
    return posterior.rvs(num_samples, random_state=rng)
# ── 1. CLD letters ──────────────────────────────────────────────────────────
# For each level, expand every experiment's [successes, trials] pair into a
# boolean outcome array (k True followed by n-k False) and pass all arrays to
# compare_success_and_get_cld; the per-experiment result is then dumped as JSON.
# NOTE(review): the rule glyph above and the dash in the banner were restored
# from mis-decoded text — confirm against the original file.
for level in ["total", "L1", "L2"]:
    print(f"\n{'='*60}")
    print(f" CLD — LEVEL: {level}")
    print(f"{'='*60}")

    success_arrays = []
    for name in model_names:
        k, n = EXPERIMENTS[name][level]
        success_arrays.append(np.array([True] * k + [False] * (n - k)))

    cld_dict = compare_success_and_get_cld(
        model_names,
        success_arrays,
        GLOBAL_CONFIDENCE,
        N_MAX,
        SHUFFLE,
        verbose=True,
    )

    print("\nJSON for HTML embed:")
    # Re-key in model_names order; ensure_ascii=False keeps non-ASCII labels
    # readable in the emitted JSON.
    json_obj = {name: cld_dict[name] for name in model_names}
    print(json.dumps(json_obj, ensure_ascii=False))
# ── 2. Verify Beta posteriors ───────────────────────────────────────────────
# For every level and experiment: print the analytic Beta(1+k, 1+n-k)
# posterior, confirm that the HTML percentage maps back to the same success
# count k, and spot-check the sampling helper against the analytic moments.
# NOTE(review): the non-ASCII glyphs in the printed strings (─, α, β, –, →, ✓,
# ✗, —) were restored from mis-decoded text — confirm against the original.
print(f"\n\n{'#'*70}")
print(" POSTERIOR VERIFICATION")
print(" Prior: Beta(1,1) (uniform). Posterior: Beta(1+k, 1+n-k)")
print(f"{'#'*70}")

# EXPERIMENTS uses "L1"/"L2"; the HTML tables use lower-case keys.
level_map = {"total": "total", "L1": "l1", "L2": "l2"}
rng = np.random.default_rng(42)
all_ok = True

# Experiment 2.5 is the one spot-checked. It is located by its ASCII prefix so
# the lookup does not depend on the non-ASCII suffix of the label.
spot_name = next(name for name in model_names if name.startswith("2.5 "))
# Largest accepted gap between sampled and analytic mean/std (as fractions).
# With 100,000 draws the Monte Carlo error is far below this.
spot_tol = 5e-3

for level in ["total", "L1", "L2"]:
    html_key = level_map[level]
    print(f"\n{'─'*60}")
    print(f" Level: {level} (HTML key: '{html_key}', n={HTML_N[html_key]})")
    print(f"{'─'*60}")
    print(
        f" {'Experiment':<24s} {'k/n':>6s} {'α':>4s} {'β':>4s} "
        f"{'Mean':>7s} {'90% CI':>16s} {'HTML%→k':>8s} {'Match':>5s}"
    )

    for name in model_names:
        k, n = EXPERIMENTS[name][level]
        alpha_post = 1 + k
        beta_post = 1 + (n - k)
        dist = stats.beta(alpha_post, beta_post)
        mean = dist.mean()
        # Equal-tailed 90% interval: 5th and 95th percentiles.
        ci_lo, ci_hi = dist.ppf(0.05), dist.ppf(0.95)

        # Verify HTML percentage round-trip
        html_pct = HTML_RAW_PCT[name][html_key]
        html_n = HTML_N[html_key]
        html_k = round(html_pct / 100 * html_n)
        match = html_k == k
        if not match:
            all_ok = False

        print(
            f" {name:<24s} {k:>2d}/{n:<2d} {alpha_post:>4d} {beta_post:>4d} "
            f"{mean*100:>6.1f}% [{ci_lo*100:>5.1f}% – {ci_hi*100:>5.1f}%] "
            f"{html_pct}%→{html_k:>2d} {'✓' if match else '✗ MISMATCH'}"
        )

    # Also run TRI's draw_samples_from_beta_posterior for a spot-check
    k_spot, n_spot = EXPERIMENTS[spot_name][level]
    arr_spot = np.array([True] * k_spot + [False] * (n_spot - k_spot))
    samples = draw_samples_from_beta_posterior(arr_spot, rng, num_samples=100_000)
    print(f"\n Spot-check ({spot_name}, {level}):")
    print(f" TRI samples: mean={np.mean(samples)*100:.1f}%, std={np.std(samples)*100:.1f}%")
    alpha_s, beta_s = 1 + k_spot, 1 + (n_spot - k_spot)
    analytic = stats.beta(alpha_s, beta_s)
    print(f" Analytic: mean={analytic.mean()*100:.1f}%, std={analytic.std()*100:.1f}%")
    print(f" HTML params: Beta({alpha_s}, {beta_s})")

    # Actually compare the two, so the final "Matches TRI ..." line is only
    # printed when the sampled moments agree with the analytic ones.
    spot_ok = (
        abs(np.mean(samples) - analytic.mean()) < spot_tol
        and abs(np.std(samples) - analytic.std()) < spot_tol
    )
    if not spot_ok:
        all_ok = False
    print(f" Samples vs analytic: {'✓' if spot_ok else '✗ MISMATCH'}")

print(f"\n{'='*60}")
if all_ok:
    print(" ALL POSTERIORS VERIFIED ✓")
    print(" - Beta(1+k, 1+n-k) with uniform prior Beta(1,1)")
    print(" - HTML percentage→k round-trip: all match")
    print(" - Matches TRI draw_samples_from_beta_posterior()")
else:
    print(" ✗ SOME MISMATCHES FOUND — see above")
print(f"{'='*60}")