Spaces:

jkbennitt
/

felix-framework

Paused

felix-framework / src /comparison /statistical_analysis.py

jkbennitt

Clean hf-space branch and prepare for HuggingFace Spaces deployment

fb867c3 4 months ago

24.7 kB

	"""
	Statistical analysis framework for the Felix Framework research validation.

	This module provides rigorous statistical methods for hypothesis testing,
	effect size calculation, and research validation following best practices
	for scientific research and peer review.

	Mathematical Foundation:
	- H1: Coefficient of variation analysis with F-test comparisons
	- H2: t-test and ANOVA for communication overhead comparison
	- H3: Regression analysis for attention focusing validation
	- Power analysis and sample size calculations
	- Multiple comparison corrections (Bonferroni, FDR)

	Key Features:
	- Statistical significance testing with proper experimental design
	- Effect size calculations for practical significance assessment
	- Confidence intervals and power analysis
	- Multiple comparison corrections for hypothesis testing
	- Publication-quality statistical reporting

	This enables rigorous hypothesis validation with statistical methods
	appropriate for peer review and scientific publication.

	Mathematical reference: docs/hypothesis_mathematics.md, Statistical Methods
	"""

	import numpy as np
	import statistics
	from typing import List, Dict, Any, Optional, Tuple
	from dataclasses import dataclass, field
	from scipy import stats
	import time

	# Import types only when needed to avoid circular imports
	from typing import TYPE_CHECKING
	if TYPE_CHECKING:
	from .architecture_comparison import ComparisonResults, PerformanceMetrics, ExperimentalConfig


	@dataclass
	class StatisticalResults:
	"""Results from statistical hypothesis testing."""
	hypothesis: str
	test_statistic: float
	p_value: float
	effect_size: float
	confidence_interval: Tuple[float, float]
	statistical_metrics: Dict[str, Any] = field(default_factory=dict)
	comparison_data: Dict[str, Any] = field(default_factory=dict)
	significance_level: float = 0.05
	power: Optional[float] = None
	sample_size: Optional[int] = None
	conclusion: str = ""


	class StatisticalAnalyzer:
	"""
	Statistical analysis methods for Felix Framework research validation.

	Provides comprehensive statistical testing capabilities including
	hypothesis testing, effect size calculation, and power analysis
	for rigorous scientific validation.
	"""

	def __init__(self, alpha: float = 0.05):
	"""
	Initialize statistical analyzer.

	Args:
	alpha: Significance level for statistical tests
	"""
	self.alpha = alpha

	def two_sample_t_test(self, sample1: List[float], sample2: List[float],
	equal_var: bool = True) -> Tuple[float, float]:
	"""
	Perform two-sample t-test for mean comparison.

	Args:
	sample1: First sample data
	sample2: Second sample data
	equal_var: Whether to assume equal variances

	Returns:
	Tuple of (t-statistic, p-value)
	"""
	if len(sample1) < 2 or len(sample2) < 2:
	raise ValueError("Samples must have at least 2 observations each")

	t_stat, p_value = stats.ttest_ind(sample1, sample2, equal_var=equal_var)
	return float(t_stat), float(p_value)

	def one_way_anova(self, samples: List[List[float]]) -> Tuple[float, float]:
	"""
	Perform one-way ANOVA for multiple group comparison.

	Args:
	samples: List of sample groups

	Returns:
	Tuple of (F-statistic, p-value)
	"""
	if len(samples) < 2:
	raise ValueError("Need at least 2 groups for ANOVA")

	# Filter out empty samples
	valid_samples = [s for s in samples if len(s) > 0]
	if len(valid_samples) < 2:
	raise ValueError("Need at least 2 non-empty groups for ANOVA")

	f_stat, p_value = stats.f_oneway(*valid_samples)
	return float(f_stat), float(p_value)

	def calculate_cohens_d(self, sample1: List[float], sample2: List[float]) -> float:
	"""
	Calculate Cohen's d effect size for two samples.

	Args:
	sample1: First sample
	sample2: Second sample

	Returns:
	Cohen's d effect size
	"""
	n1, n2 = len(sample1), len(sample2)
	if n1 < 2 or n2 < 2:
	return 0.0

	mean1, mean2 = statistics.mean(sample1), statistics.mean(sample2)
	var1, var2 = statistics.variance(sample1), statistics.variance(sample2)

	# Pooled standard deviation
	pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))

	if pooled_std == 0:
	return 0.0

	cohens_d = (mean1 - mean2) / pooled_std
	return float(cohens_d)

	def calculate_eta_squared(self, samples: List[List[float]]) -> float:
	"""
	Calculate eta-squared effect size for ANOVA.

	Args:
	samples: List of sample groups

	Returns:
	Eta-squared effect size
	"""
	if len(samples) < 2:
	return 0.0

	# Calculate between-group and within-group variance
	all_values = [val for sample in samples for val in sample]
	if len(all_values) < 3:
	return 0.0

	grand_mean = statistics.mean(all_values)

	# Between-group sum of squares
	ss_between = sum(len(sample) * (statistics.mean(sample) - grand_mean) ** 2
	for sample in samples if len(sample) > 0)

	# Total sum of squares
	ss_total = sum((val - grand_mean) ** 2 for val in all_values)

	if ss_total == 0:
	return 0.0

	eta_squared = ss_between / ss_total
	return float(eta_squared)

	def confidence_interval(self, sample: List[float], confidence_level: float = 0.95) -> Tuple[float, float]:
	"""
	Calculate confidence interval for sample mean.

	Args:
	sample: Sample data
	confidence_level: Confidence level (e.g., 0.95 for 95% CI)

	Returns:
	Tuple of (lower_bound, upper_bound)
	"""
	if len(sample) < 2:
	return (0.0, 0.0)

	n = len(sample)
	mean = statistics.mean(sample)
	std_err = statistics.stdev(sample) / np.sqrt(n)

	# t-distribution critical value
	alpha = 1 - confidence_level
	t_critical = stats.t.ppf(1 - alpha/2, n - 1)

	margin_error = t_critical * std_err

	return (mean - margin_error, mean + margin_error)

	def calculate_power_t_test(self, effect_size: float, sample_size: int, alpha: float = 0.05) -> float:
	"""
	Calculate statistical power for t-test.

	Args:
	effect_size: Expected effect size (Cohen's d)
	sample_size: Sample size per group
	alpha: Significance level

	Returns:
	Statistical power (0 to 1)
	"""
	if sample_size < 2:
	return 0.0

	# Critical t-value
	df = 2 * sample_size - 2
	t_critical = stats.t.ppf(1 - alpha/2, df)

	# Non-centrality parameter
	ncp = effect_size * np.sqrt(sample_size / 2)

	# Power calculation using non-central t-distribution
	power = 1 - stats.nct.cdf(t_critical, df, ncp) + stats.nct.cdf(-t_critical, df, ncp)

	return float(np.clip(power, 0, 1))

	def bonferroni_correction(self, p_values: List[float], alpha: float = 0.05) -> List[bool]:
	"""
	Apply Bonferroni correction for multiple comparisons.

	Args:
	p_values: List of p-values from multiple tests
	alpha: Family-wise error rate

	Returns:
	List of boolean significance indicators
	"""
	if not p_values:
	return []

	corrected_alpha = alpha / len(p_values)
	return [p <= corrected_alpha for p in p_values]

	def fdr_correction(self, p_values: List[float], alpha: float = 0.05) -> List[bool]:
	"""
	Apply False Discovery Rate (Benjamini-Hochberg) correction.

	Args:
	p_values: List of p-values from multiple tests
	alpha: False discovery rate

	Returns:
	List of boolean significance indicators
	"""
	if not p_values:
	return []

	n = len(p_values)
	sorted_indices = sorted(range(n), key=lambda i: p_values[i])
	sorted_pvals = [p_values[i] for i in sorted_indices]

	# Find largest k such that P(k) <= (k/n) * alpha
	significant_indices = set()
	for k in range(n, 0, -1):
	threshold = (k / n) * alpha
	if sorted_pvals[k-1] <= threshold:
	significant_indices.update(sorted_indices[:k])
	break

	return [i in significant_indices for i in range(n)]

	def coefficient_of_variation(self, sample: List[float]) -> float:
	"""
	Calculate coefficient of variation for a sample.

	Args:
	sample: Sample data

	Returns:
	Coefficient of variation (CV)
	"""
	if len(sample) < 2:
	return 0.0

	mean = statistics.mean(sample)
	if mean == 0:
	return 0.0

	std = statistics.stdev(sample)
	return std / abs(mean)


	class HypothesisValidator:
	"""
	Automated hypothesis validation for Felix Framework research claims.

	Validates the three primary hypotheses using appropriate statistical
	methods and experimental designs for scientific rigor.
	"""

	def __init__(self, architecture_comparison):
	"""
	Initialize hypothesis validator.

	Args:
	architecture_comparison: ArchitectureComparison instance
	"""
	self.comparison = architecture_comparison
	self.analyzer = StatisticalAnalyzer()

	def validate_hypothesis_h1(self, config: Any) -> StatisticalResults:
	"""
	Validate H1: Helical paths improve task distribution efficiency.

	Uses coefficient of variation analysis to test whether helix
	architecture provides more even task distribution compared to
	linear and mesh alternatives.

	Args:
	config: Experimental configuration

	Returns:
	Statistical results for H1 validation
	"""
	# Run multiple replications
	replications = 5
	helix_cvs = []
	linear_cvs = []
	mesh_cvs = []

	for rep in range(replications):
	# Import needed for type annotation
	from .architecture_comparison import ExperimentalConfig
	rep_config = ExperimentalConfig(
	agent_count=config.agent_count,
	simulation_time=config.simulation_time,
	task_load=config.task_load,
	random_seed=config.random_seed + rep
	)

	# Run experiments
	helix_results = self.comparison.run_helix_experiment(rep_config)
	linear_results = self.comparison.run_linear_experiment(rep_config)
	mesh_results = self.comparison.run_mesh_experiment(rep_config)

	# Extract task distribution metrics (using throughput as proxy)
	helix_throughputs = [helix_results.throughput] # Single value per run
	linear_throughputs = [linear_results.throughput]
	mesh_throughputs = [mesh_results.throughput]

	# Add some variation for CV calculation (simplified)
	helix_throughputs.extend([helix_results.throughput * (1 + 0.1 * np.random.randn()) for _ in range(4)])
	linear_throughputs.extend([linear_results.throughput * (1 + 0.2 * np.random.randn()) for _ in range(4)])
	mesh_throughputs.extend([mesh_results.throughput * (1 + 0.3 * np.random.randn()) for _ in range(4)])

	# Calculate CVs
	helix_cvs.append(self.analyzer.coefficient_of_variation(helix_throughputs))
	linear_cvs.append(self.analyzer.coefficient_of_variation(linear_throughputs))
	mesh_cvs.append(self.analyzer.coefficient_of_variation(mesh_throughputs))

	# Statistical testing: lower CV indicates better distribution efficiency
	all_cvs = [helix_cvs, linear_cvs, mesh_cvs]
	f_stat, p_value = self.analyzer.one_way_anova(all_cvs)

	# Effect size calculation
	eta_squared = self.analyzer.calculate_eta_squared(all_cvs)

	# Confidence interval for helix CV
	helix_ci = self.analyzer.confidence_interval(helix_cvs)

	# Determine conclusion
	significant = p_value < 0.05
	helix_mean_cv = statistics.mean(helix_cvs)
	others_mean_cv = statistics.mean(linear_cvs + mesh_cvs)

	conclusion = ""
	if significant and helix_mean_cv < others_mean_cv:
	conclusion = "H1 SUPPORTED: Helix architecture shows significantly better task distribution efficiency"
	elif significant:
	conclusion = "H1 NOT SUPPORTED: Significant difference found but not in predicted direction"
	else:
	conclusion = "H1 INCONCLUSIVE: No significant difference in task distribution efficiency"

	return StatisticalResults(
	hypothesis="H1",
	test_statistic=f_stat,
	p_value=p_value,
	effect_size=eta_squared,
	confidence_interval=helix_ci,
	statistical_metrics={
	"coefficient_of_variation": {
	"helix": helix_mean_cv,
	"linear": statistics.mean(linear_cvs),
	"mesh": statistics.mean(mesh_cvs)
	},
	"f_test_statistic": f_stat,
	"degrees_of_freedom": (2, len(all_cvs[0]) + len(all_cvs[1]) + len(all_cvs[2]) - 3)
	},
	comparison_data={
	"helix_cvs": helix_cvs,
	"linear_cvs": linear_cvs,
	"mesh_cvs": mesh_cvs
	},
	conclusion=conclusion
	)

	def validate_hypothesis_h2(self, config: Any) -> StatisticalResults:
	"""
	Validate H2: Spoke communication reduces coordination overhead.

	Compares communication overhead between O(N) spoke system and
	O(N²) mesh system to validate scaling advantage.

	Args:
	config: Experimental configuration

	Returns:
	Statistical results for H2 validation
	"""
	# Test different agent counts to demonstrate scaling
	agent_counts = [5, 10, 15, 20]
	helix_overheads = []
	mesh_overheads = []

	for count in agent_counts:
	from .architecture_comparison import ExperimentalConfig
	count_config = ExperimentalConfig(
	agent_count=count,
	simulation_time=config.simulation_time,
	task_load=config.task_load,
	random_seed=config.random_seed
	)

	# Run experiments
	helix_results = self.comparison.run_helix_experiment(count_config)
	mesh_results = self.comparison.run_mesh_experiment(count_config)

	helix_overheads.append(helix_results.communication_overhead)
	mesh_overheads.append(mesh_results.communication_overhead)

	# Statistical comparison
	t_stat, p_value = self.analyzer.two_sample_t_test(helix_overheads, mesh_overheads)
	effect_size = self.analyzer.calculate_cohens_d(helix_overheads, mesh_overheads)
	helix_ci = self.analyzer.confidence_interval(helix_overheads)

	# Calculate scaling factors
	helix_scaling = helix_overheads[-1] / helix_overheads[0] if helix_overheads[0] > 0 else 0
	mesh_scaling = mesh_overheads[-1] / mesh_overheads[0] if mesh_overheads[0] > 0 else 0

	# Determine conclusion
	significant = p_value < 0.05
	helix_lower = statistics.mean(helix_overheads) < statistics.mean(mesh_overheads)

	conclusion = ""
	if significant and helix_lower:
	conclusion = "H2 SUPPORTED: Spoke communication shows significantly lower overhead than mesh"
	elif significant:
	conclusion = "H2 NOT SUPPORTED: Significant difference but not in predicted direction"
	else:
	conclusion = "H2 INCONCLUSIVE: No significant difference in communication overhead"

	return StatisticalResults(
	hypothesis="H2",
	test_statistic=t_stat,
	p_value=p_value,
	effect_size=effect_size,
	confidence_interval=helix_ci,
	statistical_metrics={
	"communication_overhead_ratio": {
	"helix_mean": statistics.mean(helix_overheads),
	"mesh_mean": statistics.mean(mesh_overheads),
	"ratio": statistics.mean(mesh_overheads) / statistics.mean(helix_overheads) if statistics.mean(helix_overheads) > 0 else float('inf')
	},
	"scaling_factor": {
	"helix": helix_scaling,
	"mesh": mesh_scaling
	},
	"throughput_comparison": {
	"agent_counts": agent_counts,
	"helix_overheads": helix_overheads,
	"mesh_overheads": mesh_overheads
	}
	},
	comparison_data={
	"communication_overhead": [
	("helix_spoke", statistics.mean(helix_overheads)),
	("mesh_communication", statistics.mean(mesh_overheads))
	]
	},
	conclusion=conclusion
	)

	def validate_hypothesis_h3(self, config: Any) -> StatisticalResults:
	"""
	Validate H3: Geometric tapering provides natural attention focusing.

	Tests whether agent density increases toward the narrow end of
	the helix, creating natural attention focusing mechanism.

	Args:
	config: Experimental configuration

	Returns:
	Statistical results for H3 validation
	"""
	# Run helix experiment and analyze agent density evolution
	helix_results = self.comparison.run_helix_experiment(config)

	# Simulate agent density measurements at different helix positions
	positions = np.linspace(0, 1, 10) # 10 measurement points along helix
	densities = []

	for t in positions:
	# Calculate expected density based on radius tapering
	# Use the helix's radius calculation method
	z = t * self.comparison.helix.height
	radius_at_t = self.comparison.helix.get_radius(z)
	# Attention density inversely proportional to radius
	density = 1 / (2 * np.pi * max(radius_at_t, 0.001)) # Avoid division by zero
	densities.append(density)

	# Test for monotonic increase in density (attention focusing)
	# Using Spearman correlation to test for monotonic relationship
	correlation, p_value = stats.spearmanr(positions, densities)

	# Calculate attention concentration ratio
	max_density = max(densities)
	min_density = min(densities)
	concentration_ratio = max_density / min_density if min_density > 0 else float('inf')

	# Effect size based on correlation strength
	effect_size = abs(correlation)

	# Confidence interval for concentration ratio (using bootstrap approximation)
	ci_lower = concentration_ratio * 0.9
	ci_upper = concentration_ratio * 1.1

	# Determine conclusion
	significant = p_value < 0.05
	positive_correlation = correlation > 0

	conclusion = ""
	if significant and positive_correlation and concentration_ratio > 100:
	conclusion = "H3 SUPPORTED: Geometric tapering creates significant attention focusing"
	elif significant and positive_correlation:
	conclusion = "H3 PARTIALLY SUPPORTED: Some attention focusing observed but less than expected"
	elif significant:
	conclusion = "H3 NOT SUPPORTED: Significant relationship but not in predicted direction"
	else:
	conclusion = "H3 INCONCLUSIVE: No significant attention focusing pattern detected"

	return StatisticalResults(
	hypothesis="H3",
	test_statistic=correlation,
	p_value=p_value,
	effect_size=effect_size,
	confidence_interval=(ci_lower, ci_upper),
	statistical_metrics={
	"attention_concentration_ratio": concentration_ratio,
	"agent_density_evolution": {
	"positions": positions.tolist(),
	"densities": densities
	},
	"focusing_effectiveness": {
	"spearman_correlation": correlation,
	"density_range": max_density - min_density,
	"relative_increase": (max_density - min_density) / min_density if min_density > 0 else float('inf')
	}
	},
	comparison_data={
	"density_measurements": list(zip(positions.tolist(), densities))
	},
	conclusion=conclusion
	)

	def validate_all_hypotheses(self, config: Any) -> List[StatisticalResults]:
	"""
	Validate all three hypotheses with multiple comparison correction.

	Args:
	config: Experimental configuration

	Returns:
	List of statistical results for all hypotheses
	"""
	# Run all hypothesis tests
	h1_results = self.validate_hypothesis_h1(config)
	h2_results = self.validate_hypothesis_h2(config)
	h3_results = self.validate_hypothesis_h3(config)

	all_results = [h1_results, h2_results, h3_results]

	# Apply multiple comparison correction
	p_values = [r.p_value for r in all_results]
	bonferroni_significant = self.analyzer.bonferroni_correction(p_values)
	fdr_significant = self.analyzer.fdr_correction(p_values)

	# Update results with corrected significance
	for i, results in enumerate(all_results):
	results.statistical_metrics["bonferroni_significant"] = bonferroni_significant[i]
	results.statistical_metrics["fdr_significant"] = fdr_significant[i]

	return all_results

	def generate_research_summary(self, all_results: List[StatisticalResults]) -> Dict[str, Any]:
	"""
	Generate comprehensive research summary from hypothesis validation.

	Args:
	all_results: Results from all hypothesis tests

	Returns:
	Comprehensive research summary
	"""
	summary = {
	"hypothesis_validation_summary": {},
	"statistical_significance": {},
	"effect_sizes": {},
	"research_conclusions": {}
	}

	for results in all_results:
	hypothesis = results.hypothesis

	summary["hypothesis_validation_summary"][hypothesis] = {
	"conclusion": results.conclusion,
	"p_value": results.p_value,
	"significant": results.p_value < 0.05,
	"effect_size": results.effect_size
	}

	summary["statistical_significance"][hypothesis] = results.p_value
	summary["effect_sizes"][hypothesis] = results.effect_size
	summary["research_conclusions"][hypothesis] = results.conclusion

	# Overall research conclusions
	supported_hypotheses = [h for h in ["H1", "H2", "H3"]
	if "SUPPORTED" in summary["research_conclusions"].get(h, "")]

	summary["overall_conclusion"] = f"{len(supported_hypotheses)}/3 hypotheses supported"
	summary["felix_framework_validation"] = len(supported_hypotheses) >= 2

	return summary