# NOTE: the lines "Spaces:" / "Build error" that preceded this file were
# Hugging Face Space listing / build-log residue, not part of the app source.
"""
A/B Test Simulator - Understand statistical significance for AI experiments
Helps PMs calculate sample sizes and interpret A/B test results
"""
import gradio as gr
import numpy as np
import plotly.graph_objects as go
from scipy import stats

# Seed NumPy's global RNG once at import time so the Monte-Carlo simulations
# (np.random.binomial calls in simulate_ab_test) are reproducible across runs.
np.random.seed(42)
def calculate_sample_size(baseline_rate, expected_lift, confidence_level, power):
    """Return the required sample size per group for a two-proportion A/B test.

    Uses the standard normal-approximation formula with a pooled variance term
    (two-sided test).

    Args:
        baseline_rate: control success rate, in percent (e.g. 10 for 10%).
        expected_lift: relative lift to detect, in percent of the baseline.
        confidence_level: confidence level in percent (e.g. 95).
        power: statistical power in percent (e.g. 80).

    Returns:
        Required samples per group as an int, or ``float('inf')`` when the
        treatment proportion equals the control proportion (zero lift).
    """
    # Convert percentages to proportions.
    p1 = baseline_rate / 100
    # Clamp the treatment proportion at 1.0: a large lift on a high baseline
    # would otherwise make p2*(1-p2) negative, turning np.sqrt into NaN and
    # int(np.ceil(...)) into a ValueError.
    p2 = min(p1 * (1 + expected_lift / 100), 1.0)
    # Guard the degenerate case up front: identical proportions can never be
    # distinguished, so the required sample size is unbounded.
    if p2 == p1:
        return float('inf')
    # Z-scores for the two-sided confidence level and for the power.
    alpha = 1 - confidence_level / 100
    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power / 100)
    # Pooled proportion under the null hypothesis.
    p_pooled = (p1 + p2) / 2
    # Sample size formula: ((z_a*sqrt(2*p̄(1-p̄)) + z_b*sqrt(p1(1-p1)+p2(1-p2)))^2) / (p2-p1)^2
    numerator = (z_alpha * np.sqrt(2 * p_pooled * (1 - p_pooled)) +
                 z_beta * np.sqrt(p1 * (1 - p1) + p2 * (1 - p2))) ** 2
    return int(np.ceil(numerator / (p2 - p1) ** 2))
def simulate_ab_test(baseline_rate, actual_lift, sample_size_per_group, num_simulations=1000):
    """Monte-Carlo simulate repeated A/B tests at a fixed true effect size.

    Args:
        baseline_rate: control success rate, in percent.
        actual_lift: true relative lift of the treatment, in percent.
        sample_size_per_group: number of samples drawn per arm per simulation.
        num_simulations: how many independent tests to simulate.

    Returns:
        Tuple of (fraction of simulations significant at p < 0.05,
        list of p-values, list of observed lifts in percent).
    """
    p_control = baseline_rate / 100
    # Clamp so np.random.binomial never receives p > 1 (which raises).
    p_treatment = min(p_control * (1 + actual_lift / 100), 1.0)
    significant_count = 0
    p_values = []
    observed_lifts = []
    for _ in range(num_simulations):
        # Simulate outcomes for each arm.
        control_successes = np.random.binomial(sample_size_per_group, p_control)
        treatment_successes = np.random.binomial(sample_size_per_group, p_treatment)
        # Observed success rates.
        control_rate = control_successes / sample_size_per_group
        treatment_rate = treatment_successes / sample_size_per_group
        # Observed relative lift; undefined when control saw zero successes.
        if control_rate > 0:
            observed_lift = (treatment_rate - control_rate) / control_rate * 100
        else:
            observed_lift = 0
        observed_lifts.append(observed_lift)
        # Chi-square test on the 2x2 success/failure table.
        contingency = [[control_successes, sample_size_per_group - control_successes],
                       [treatment_successes, sample_size_per_group - treatment_successes]]
        # chi2_contingency raises ValueError when a row or column sums to zero
        # (e.g. zero successes in both arms at small n); treat that degenerate
        # table as "no evidence of a difference".
        try:
            _, p_value, _, _ = stats.chi2_contingency(contingency)
        except ValueError:
            p_value = 1.0
        p_values.append(p_value)
        if p_value < 0.05:
            significant_count += 1
    return significant_count / num_simulations, p_values, observed_lifts
def run_analysis(baseline_rate, expected_lift, confidence_level, power, daily_traffic):
    """Run full A/B test analysis for the "Plan a Test" tab.

    Computes the required sample size, estimates test duration from daily
    traffic, simulates 500 tests at the target size, and returns the four
    Gradio outputs: (markdown summary, sample-size bar chart, p-value
    histogram, observed-lift histogram). All rate/lift/confidence/power
    inputs are percentages.
    """
    # Calculate required sample size (per group).
    sample_size = calculate_sample_size(baseline_rate, expected_lift, confidence_level, power)
    # Calculate days needed; daily traffic is split evenly between A and B.
    samples_per_day = daily_traffic / 2  # Split between A and B
    days_needed = int(np.ceil(sample_size / samples_per_day)) if samples_per_day > 0 else float('inf')
    # Create sample size vs lift chart over a fixed grid of candidate lifts.
    lifts = [2, 5, 10, 15, 20, 25, 30]
    sample_sizes = [calculate_sample_size(baseline_rate, lift, confidence_level, power) for lift in lifts]
    fig1 = go.Figure()
    fig1.add_trace(go.Bar(
        x=[f"{l}%" for l in lifts],
        y=sample_sizes,
        # Traffic-light coloring: red = >10k samples, amber = >2k, green = cheap.
        marker_color=['#dc2626' if s > 10000 else '#f59e0b' if s > 2000 else '#22c55e' for s in sample_sizes]
    ))
    # Horizontal reference line at the user's own required sample size.
    fig1.add_hline(y=sample_size, line_dash="dash", line_color="#40B8A6",
                   annotation_text=f"Your target: {sample_size:,}")
    fig1.update_layout(
        title="Sample Size Required by Expected Lift",
        xaxis_title="Expected Lift",
        yaxis_title="Sample Size per Group",
        height=350
    )
    # Simulate what happens if we run the test at exactly the computed size.
    power_achieved, p_values, observed_lifts = simulate_ab_test(
        baseline_rate, expected_lift, sample_size, num_simulations=500
    )
    # P-value distribution across simulations.
    fig2 = go.Figure()
    fig2.add_trace(go.Histogram(
        x=p_values,
        nbinsx=30,
        marker_color='#40B8A6'
    ))
    # Mark the conventional significance threshold.
    fig2.add_vline(x=0.05, line_dash="dash", line_color="#dc2626",
                   annotation_text="p=0.05 threshold")
    fig2.update_layout(
        title="Distribution of P-Values (500 Simulations)",
        xaxis_title="P-Value",
        yaxis_title="Count",
        height=300
    )
    # Observed lift distribution, with the true lift marked for comparison.
    fig3 = go.Figure()
    fig3.add_trace(go.Histogram(
        x=observed_lifts,
        nbinsx=30,
        marker_color='#40B8A6'
    ))
    fig3.add_vline(x=expected_lift, line_dash="dash", line_color="#22c55e",
                   annotation_text=f"True lift: {expected_lift}%")
    fig3.update_layout(
        title="Distribution of Observed Lifts",
        xaxis_title="Observed Lift (%)",
        yaxis_title="Count",
        height=300
    )
    # Markdown summary (content kept flush-left so it renders without
    # accidental code-block indentation).
    summary = f"""## A/B Test Analysis
### Required Sample Size
**{sample_size:,} samples per group** ({sample_size * 2:,} total)
### Time Estimate
At {daily_traffic:,} users/day: **~{days_needed} days**
### Configuration
- Baseline Rate: {baseline_rate}%
- Expected Lift: {expected_lift}%
- Confidence Level: {confidence_level}%
- Statistical Power: {power}%
### Simulation Results (500 runs)
- **Power Achieved:** {power_achieved*100:.1f}% of tests detected the effect
- Median Observed Lift: {np.median(observed_lifts):.1f}%
- Lift Range (95%): {np.percentile(observed_lifts, 2.5):.1f}% to {np.percentile(observed_lifts, 97.5):.1f}%
### Interpretation
{"The test is well-powered and should reliably detect this effect size." if power_achieved > 0.75 else "Warning: The test may be underpowered. Consider running longer or targeting a larger effect size."}
"""
    return summary, fig1, fig2, fig3
def quick_check(control_rate, treatment_rate, control_n, treatment_n):
    """Quick significance check for existing A/B test results.

    Args:
        control_rate: control success rate, in percent.
        treatment_rate: treatment success rate, in percent.
        control_n: control group sample size.
        treatment_n: treatment group sample size.

    Returns:
        A markdown summary string with the p-value, observed lift,
        conclusion, and recommendation.
    """
    # Round (rather than truncate) when reconstructing success counts from
    # rates, so e.g. 10.9% of 1000 becomes 109 instead of 108.
    control_successes = int(round(control_rate / 100 * control_n))
    treatment_successes = int(round(treatment_rate / 100 * treatment_n))
    contingency = [[control_successes, control_n - control_successes],
                   [treatment_successes, treatment_n - treatment_successes]]
    # chi2_contingency raises ValueError when a row or column sums to zero
    # (both rates 0% or 100%); treat that degenerate case as no evidence.
    try:
        _, p_value, _, _ = stats.chi2_contingency(contingency)
    except ValueError:
        p_value = 1.0
    observed_lift = (treatment_rate - control_rate) / control_rate * 100 if control_rate > 0 else 0
    if p_value < 0.01:
        result = "**Highly Significant** (p < 0.01)"
    elif p_value < 0.05:
        result = "**Statistically Significant** (p < 0.05)"
    elif p_value < 0.1:
        result = "**Marginally Significant** (p < 0.1)"
    else:
        result = "**Not Statistically Significant** (p >= 0.1)"
    # BUG FIX: the old text computed str(int(p_value*100)), which rendered
    # "less than 0%" for any p < 0.01; show one decimal place instead.
    chance_pct = f"{p_value * 100:.1f}%"
    summary = f"""## Results
**P-Value:** {p_value:.4f}
**Observed Lift:** {observed_lift:+.1f}%
### Conclusion
{result}
### What This Means
- There is roughly a {chance_pct} chance a difference at least this large would occur by random chance alone
- {"You can be confident Treatment is better than Control" if p_value < 0.05 else "You cannot confidently conclude Treatment is better - the difference might be noise"}
### Recommendation
{"Consider rolling out Treatment" if p_value < 0.05 and observed_lift > 0 else "Keep Control or run a longer test" if p_value >= 0.05 else "Treatment appears worse than Control"}
"""
    return summary
# Build Gradio interface. Component creation order inside the Blocks context
# defines the page layout; event listeners must be registered inside the
# context as well.
with gr.Blocks(title="A/B Test Simulator", theme=gr.themes.Soft()) as demo:
    # Page header / intro copy.
    gr.Markdown("""
    # A/B Test Simulator
    Calculate sample sizes, simulate A/B tests, and check statistical significance.
    **For Product Managers:** Understand when you have enough data to make a decision,
    and avoid the trap of calling tests too early.
    """)
    gr.Markdown(
        "> **PM Decision:** A/B testing AI requires larger sample sizes than traditional tests due to output variability. "
        "Plan for statistical significance - calling tests too early leads to false positives."
    )
    # Tab 1: plan a new test (sample-size calculator + Monte-Carlo simulation).
    with gr.Tab("Plan a Test"):
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Test Parameters")
                baseline = gr.Slider(
                    minimum=1, maximum=50, value=10, step=1,
                    label="Baseline Success Rate (%)",
                    info="Current conversion/success rate"
                )
                lift = gr.Slider(
                    minimum=1, maximum=50, value=10, step=1,
                    label="Minimum Lift to Detect (%)",
                    info="Smallest improvement worth detecting"
                )
                confidence = gr.Slider(
                    minimum=80, maximum=99, value=95, step=1,
                    label="Confidence Level (%)",
                    info="95% is standard"
                )
                power = gr.Slider(
                    minimum=60, maximum=95, value=80, step=5,
                    label="Statistical Power (%)",
                    info="80% is standard"
                )
                traffic = gr.Slider(
                    minimum=100, maximum=100000, value=5000, step=100,
                    label="Daily Traffic",
                    info="Users per day in the test"
                )
                plan_btn = gr.Button("Calculate & Simulate", variant="primary")
            with gr.Column(scale=2):
                # Outputs filled by run_analysis.
                plan_summary = gr.Markdown()
                sample_chart = gr.Plot()
                pvalue_chart = gr.Plot()
                lift_chart = gr.Plot()
    # Tab 2: significance check for results the user already has.
    with gr.Tab("Check Existing Results"):
        gr.Markdown("### Enter Your Test Results")
        with gr.Row():
            with gr.Column():
                control_rate_input = gr.Number(
                    value=10, label="Control Success Rate (%)"
                )
                control_n_input = gr.Number(
                    value=1000, label="Control Sample Size"
                )
            with gr.Column():
                treatment_rate_input = gr.Number(
                    value=12, label="Treatment Success Rate (%)"
                )
                treatment_n_input = gr.Number(
                    value=1000, label="Treatment Sample Size"
                )
        check_btn = gr.Button("Check Significance", variant="primary")
        check_result = gr.Markdown()
    # Static PM guidance footer.
    gr.Markdown("""
    ---
    ### PM Guide to A/B Testing AI
    **Why AI A/B Testing is Different:**
    - AI outputs are **stochastic** (same input can give different outputs)
    - Quality is **subjective** (harder to measure than click rates)
    - **Harmful outputs** can hurt users, not just convert poorly
    **Rules of Thumb:**
    | Expected Lift | Typical Sample Size | Notes |
    |--------------|---------------------|-------|
    | 5% | 3,000+ per group | Very hard to detect |
    | 10% | ~800 per group | Standard test |
    | 20% | ~200 per group | Easy to detect |
    **Common Mistakes:**
    1. Calling the test too early (peeking)
    2. Not accounting for day-of-week effects
    3. Testing on a biased subset of users
    4. Ignoring practical significance (statistically significant but tiny lift)
    """)
    # Event handlers: wire the buttons to the analysis functions.
    plan_btn.click(
        fn=run_analysis,
        inputs=[baseline, lift, confidence, power, traffic],
        outputs=[plan_summary, sample_chart, pvalue_chart, lift_chart]
    )
    check_btn.click(
        fn=quick_check,
        inputs=[control_rate_input, treatment_rate_input, control_n_input, treatment_n_input],
        outputs=[check_result]
    )

if __name__ == "__main__":
    demo.launch()