robot-folding

Running

App Files Files Community

robot-folding / compute_cld.py

pepijn223 HF Staff

Improve DAgger explainer, add conclusion and expand references

f0f3d44 unverified about 2 months ago

raw

history blame contribute delete

6.23 kB

	"""
	Compute CLD (Compact Letter Display) letters and verify Beta posteriors
	for all experiments using the TRI STEP sequential testing framework.

	Parameters:
	- global_confidence_level = 0.90 (α=0.10)
	- n_max = 50
	- shuffle = False (each rollout is independent)
	- 11 experiments → C(11,2) = 55 pairwise comparisons → Bonferroni correction
	"""

	import json
	import numpy as np
	from scipy import stats
	from sequentialized_barnard_tests.tools.plotting import compare_success_and_get_cld

	# Raw rollout outcomes per experiment. Each level maps to a
	# [successes, trials] pair; the verification loop unpacks it as ``k, n``.
	EXPERIMENTS = {
	    "1.1 π0": {"total": [8, 20], "L1": [8, 10], "L2": [0, 10]},
	    "1.2 π0.5": {"total": [4, 20], "L1": [4, 10], "L2": [0, 10]},
	    "1.3 Relative": {"total": [7, 20], "L1": [7, 10], "L2": [0, 10]},
	    "1.4 RABC low": {"total": [3, 20], "L1": [3, 10], "L2": [0, 10]},
	    "1.5 RABC high": {"total": [0, 20], "L1": [0, 10], "L2": [0, 10]},
	    "1.7 Rel+RABC": {"total": [8, 20], "L1": [8, 10], "L2": [0, 10]},
	    "2.1 HQ": {"total": [8, 20], "L1": [7, 10], "L2": [1, 10]},
	    "2.2 HQ+RABC+Rel": {"total": [15, 20], "L1": [10, 10], "L2": [5, 10]},
	    "2.3 HQ+mirror": {"total": [1, 20], "L1": [0, 10], "L2": [1, 10]},
	    "2.4 HQ chunk45": {"total": [4, 20], "L1": [4, 10], "L2": [0, 10]},
	    "2.5 HQ+RABC+Rel★": {"total": [18, 20], "L1": [10, 10], "L2": [8, 10]},
	}

	# Success rates as the HTML files store them (whole-number percentages).
	# Kept separately so the script can cross-check the percentage -> count
	# round-trip against the raw counts.
	HTML_RAW_PCT = {
	    "1.1 π0": {"total": 40, "l1": 80, "l2": 0},
	    "1.2 π0.5": {"total": 20, "l1": 40, "l2": 0},
	    "1.3 Relative": {"total": 35, "l1": 70, "l2": 0},
	    "1.4 RABC low": {"total": 15, "l1": 30, "l2": 0},
	    "1.5 RABC high": {"total": 0, "l1": 0, "l2": 0},
	    "1.7 Rel+RABC": {"total": 40, "l1": 80, "l2": 0},
	    "2.1 HQ": {"total": 40, "l1": 70, "l2": 10},
	    "2.2 HQ+RABC+Rel": {"total": 75, "l1": 100, "l2": 50},
	    "2.3 HQ+mirror": {"total": 5, "l1": 0, "l2": 10},
	    "2.4 HQ chunk45": {"total": 20, "l1": 40, "l2": 0},
	    "2.5 HQ+RABC+Rel★": {"total": 90, "l1": 100, "l2": 80},
	}

	# Trial count per HTML key, used to turn a percentage back into a count.
	HTML_N = {"total": 20, "l1": 10, "l2": 10}

	# Settings handed to compare_success_and_get_cld for every level.
	GLOBAL_CONFIDENCE = 0.90  # global confidence level (alpha = 0.10 per the module docstring)
	N_MAX = 50  # n_max from the module docstring
	SHUFFLE = False  # rollouts are treated as independent, so no shuffling

	# Experiment names in declaration order; reused by both report sections.
	model_names = list(EXPERIMENTS)


	def draw_samples_from_beta_posterior(
	    success_array: np.ndarray,
	    rng: np.random.Generator,
	    num_samples: int = 10000,
	    alpha_prior: float = 1,
	    beta_prior: float = 1,
	) -> np.ndarray:
	    """Draw samples from the Beta posterior of a success rate.

	    Kept identical in logic to the function taken from TRI's notebook (as
	    stated by the original docstring). The posterior is
	    ``Beta(alpha_prior + successes, beta_prior + failures)``, where
	    ``successes = np.sum(success_array)`` and ``failures`` is the number of
	    remaining trials.

	    Args:
	        success_array: Per-trial outcomes; this script passes boolean arrays
	            with ``True`` for a success.
	        rng: NumPy generator forwarded as ``random_state`` to the sampler.
	        num_samples: Number of posterior draws to return.
	        alpha_prior: Alpha parameter of the Beta prior.
	        beta_prior: Beta parameter of the Beta prior.

	    Returns:
	        Array of ``num_samples`` draws from the posterior distribution.
	    """
	    n_trials = len(success_array)
	    n_successes = np.sum(success_array)
	    n_failures = n_trials - n_successes
	    posterior = stats.beta(alpha_prior + n_successes, beta_prior + n_failures)
	    return posterior.rvs(num_samples, random_state=rng)


	# ── 1. CLD letters ──────────────────────────────────────────────────────────
	# For each level, expand the [successes, trials] counts into one boolean
	# array per experiment and hand them to compare_success_and_get_cld.
	for level in ["total", "L1", "L2"]:
	    print(f"\n{'='*60}")
	    print(f" CLD — LEVEL: {level}")
	    print(f"{'='*60}")

	    success_arrays = []
	    for name in model_names:
	        k, n = EXPERIMENTS[name][level]
	        # k successes followed by n - k failures.
	        arr = np.array([True] * k + [False] * (n - k))
	        success_arrays.append(arr)

	    cld_dict = compare_success_and_get_cld(
	        model_names,
	        success_arrays,
	        GLOBAL_CONFIDENCE,
	        N_MAX,
	        SHUFFLE,
	        verbose=True,
	    )

	    # Emit the letters in experiment order so they can be pasted into the HTML.
	    print(f"\nJSON for HTML embed:")
	    json_obj = {name: cld_dict[name] for name in model_names}
	    print(json.dumps(json_obj, ensure_ascii=False))


	# ── 2. Verify Beta posteriors ────────────────────────────────────────────────
	# For every level and experiment: print the analytic Beta(1+k, 1+n-k)
	# posterior summary and check that the percentage stored in the HTML maps
	# back to the same success count k.
	print(f"\n\n{'#'*70}")
	print(f" POSTERIOR VERIFICATION")
	print(f" Prior: Beta(1,1) (uniform). Posterior: Beta(1+k, 1+n-k)")
	print(f"{'#'*70}")

	# EXPERIMENTS uses "L1"/"L2" while the HTML data uses lowercase keys.
	level_map = {"total": "total", "L1": "l1", "L2": "l2"}
	rng = np.random.default_rng(42)
	all_ok = True

	for level in ["total", "L1", "L2"]:
	    html_key = level_map[level]
	    print(f"\n{'─'*60}")
	    print(f" Level: {level} (HTML key: '{html_key}', n={HTML_N[html_key]})")
	    print(f"{'─'*60}")
	    print(f" {'Experiment':<24s} {'k/n':>6s} {'α':>4s} {'β':>4s} {'Mean':>7s} {'90% CI':>16s} {'HTML%→k':>8s} {'Match':>5s}")

	    for name in model_names:
	        k, n = EXPERIMENTS[name][level]
	        alpha_post = 1 + k
	        beta_post = 1 + (n - k)
	        dist = stats.beta(alpha_post, beta_post)
	        mean = dist.mean()
	        # Central 90% interval: 5th to 95th percentile of the posterior.
	        ci_lo, ci_hi = dist.ppf(0.05), dist.ppf(0.95)

	        # Verify HTML percentage round-trip
	        html_pct = HTML_RAW_PCT[name][html_key]
	        html_n = HTML_N[html_key]
	        html_k = round(html_pct / 100 * html_n)
	        match = html_k == k

	        if not match:
	            all_ok = False

	        # Fractions are scaled by 100 so they print as percentages.
	        print(
	            f" {name:<24s} {k:>2d}/{n:<2d} {alpha_post:>4d} {beta_post:>4d} "
	            f"{mean*100:>6.1f}% [{ci_lo*100:>5.1f}% – {ci_hi*100:>5.1f}%] "
	            f"{html_pct}%→{html_k:>2d} {'✓' if match else '✗ MISMATCH'}"
	        )

	    # Also run TRI's draw_samples_from_beta_posterior for a spot-check
	    # (run once per level, since it indexes EXPERIMENTS by ``level``).
	    spot_name = "2.5 HQ+RABC+Rel★"
	    k_spot, n_spot = EXPERIMENTS[spot_name][level]
	    arr_spot = np.array([True] * k_spot + [False] * (n_spot - k_spot))
	    samples = draw_samples_from_beta_posterior(arr_spot, rng, num_samples=100_000)
	    print(f"\n Spot-check ({spot_name}, {level}):")
	    print(f" TRI samples: mean={np.mean(samples)*100:.1f}%, std={np.std(samples)*100:.1f}%")
	    alpha_s, beta_s = 1 + k_spot, 1 + (n_spot - k_spot)
	    analytic = stats.beta(alpha_s, beta_s)
	    print(f" Analytic: mean={analytic.mean()*100:.1f}%, std={analytic.std()*100:.1f}%")
	    print(f" HTML params: Beta({alpha_s}, {beta_s})")

	# NOTE(review): all_ok only reflects the percentage→k round-trip; the
	# sampled-vs-analytic spot-check above is printed for visual comparison and
	# is not enforced here.
	print(f"\n{'='*60}")
	if all_ok:
	    print(" ALL POSTERIORS VERIFIED ✓")
	    print(" - Beta(1+k, 1+n-k) with uniform prior Beta(1,1)")
	    print(" - HTML percentage→k round-trip: all match")
	    print(" - Matches TRI draw_samples_from_beta_posterior()")
	else:
	    print(" ✗ SOME MISMATCHES FOUND — see above")
	print(f"{'='*60}")