| """
|
| Statistical Validation Module
|
|
|
| Provides statistical functions for benchmark analysis:
|
| - Standard deviation calculation
|
| - Paired difference testing
|
| - Confidence intervals
|
| - Statistical significance testing
|
| """
|
|
|
import math
from statistics import NormalDist
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_mean(values: List[float]) -> float:
    """Return the arithmetic mean of ``values``.

    An empty input yields ``0.0`` rather than raising ``ZeroDivisionError``.
    """
    return sum(values) / len(values) if values else 0.0
|
|
|
|
|
def calculate_standard_deviation(values: List[float]) -> float:
    """
    Return the population standard deviation of ``values``.

    Formula: σ = sqrt(Σ(xi - μ)² / n)

    The divisor is ``n`` (not ``n - 1``), so the input is treated as the
    whole population. An empty input yields ``0.0``.
    """
    if not values:
        return 0.0

    center = calculate_mean(values)
    squared_deviations = [(value - center) ** 2 for value in values]
    return math.sqrt(sum(squared_deviations) / len(values))
|
|
|
|
|
def calculate_sample_std(values: List[float]) -> float:
    """
    Return the sample standard deviation of ``values`` (Bessel-corrected).

    Formula: s = sqrt(Σ(xi - x̄)² / (n-1))

    Fewer than two values leave the estimate undefined; ``0.0`` is returned
    in that case.
    """
    count = len(values)
    if count < 2:
        return 0.0

    center = calculate_mean(values)
    squared_deviations = [(value - center) ** 2 for value in values]
    return math.sqrt(sum(squared_deviations) / (count - 1))
|
|
|
|
|
def calculate_variance(values: List[float]) -> float:
    """Return the population variance of ``values`` (divisor ``n``).

    An empty input yields ``0.0``.
    """
    if not values:
        return 0.0

    center = calculate_mean(values)
    squared_deviations = [(value - center) ** 2 for value in values]
    return sum(squared_deviations) / len(values)
|
|
|
|
|
|
|
|
|
|
|
|
|
class MetricStatistics:
    """Namespace of static helpers that compute per-metric dispersion."""

    @staticmethod
    def calculate_all(
        hallucinations: List[float],
        toxicities: List[float],
        biases: List[float],
        confidences: List[float],
    ) -> Dict[str, float]:
        """
        Compute the population standard deviation of each metric series.

        Returns:
            Dictionary mapping ``std_<metric>`` to that metric's population
            standard deviation.
        """
        result_keys = (
            "std_hallucination",
            "std_toxicity",
            "std_bias",
            "std_confidence",
        )
        metric_series = (hallucinations, toxicities, biases, confidences)
        return {
            key: calculate_standard_deviation(series)
            for key, series in zip(result_keys, metric_series)
        }

    @staticmethod
    def calculate_sample_stds(
        hallucinations: List[float],
        toxicities: List[float],
        biases: List[float],
        confidences: List[float],
    ) -> Dict[str, float]:
        """
        Compute the sample standard deviation of each metric series.

        Returns:
            Dictionary mapping ``sample_std_<metric>`` to that metric's
            sample (n - 1) standard deviation.
        """
        result_keys = (
            "sample_std_hallucination",
            "sample_std_toxicity",
            "sample_std_bias",
            "sample_std_confidence",
        )
        metric_series = (hallucinations, toxicities, biases, confidences)
        return {
            key: calculate_sample_std(series)
            for key, series in zip(result_keys, metric_series)
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_confidence_interval(
    values: List[float],
    confidence: float = 0.95,
) -> Tuple[float, float, float]:
    """
    Calculate a confidence interval for the mean (normal approximation).

    The interval is ``mean ± z * s / sqrt(n)``, where ``s`` is the sample
    standard deviation and ``z`` is the two-sided standard-normal critical
    value for ``confidence``. Because a z-score (not a Student's t quantile)
    is used, the interval is narrower than a t-based one for small samples.

    Args:
        values: List of values
        confidence: Confidence level, strictly between 0 and 1
            (default 0.95 for 95%)

    Returns:
        Tuple of (lower_bound, upper_bound, margin_of_error). With fewer
        than two values the interval collapses to ``(mean, mean, 0.0)``.

    Raises:
        ValueError: If ``confidence`` is not strictly between 0 and 1.
    """
    if not 0.0 < confidence < 1.0:
        raise ValueError(
            f"confidence must be strictly between 0 and 1, got {confidence!r}"
        )

    n = len(values)
    mean = calculate_mean(values)
    if n < 2:
        # No spread can be estimated from zero or one observation.
        return mean, mean, 0.0

    std_error = calculate_sample_std(values) / math.sqrt(n)

    # Conventional rounded critical values, kept so the common levels give
    # exactly the same results as before.
    conventional_z_scores = {
        0.90: 1.645,
        0.95: 1.96,
        0.99: 2.576,
    }

    z = conventional_z_scores.get(confidence)
    if z is None:
        # Any other level gets its true two-sided critical value instead of
        # silently being treated as 95%.
        z = NormalDist().inv_cdf(0.5 + confidence / 2.0)

    margin_of_error = z * std_error
    return mean - margin_of_error, mean + margin_of_error, margin_of_error
|
|
|
|
|
def calculate_mean_with_ci(
    values: List[float],
    confidence: float = 0.95,
) -> Dict[str, float]:
    """
    Bundle the mean and its confidence interval into one dictionary.

    Args:
        values: List of values
        confidence: Confidence level passed on to
            ``calculate_confidence_interval``

    Returns:
        Dictionary with mean, lower_ci, upper_ci, margin_of_error. Every
        entry is ``0.0`` when ``values`` is empty.
    """
    if values:
        lower, upper, margin = calculate_confidence_interval(values, confidence)
        center = calculate_mean(values)
    else:
        center = lower = upper = margin = 0.0

    return {
        "mean": center,
        "lower_ci": lower,
        "upper_ci": upper,
        "margin_of_error": margin,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_paired_differences(
    baseline_values: List[float],
    adversarial_values: List[float],
) -> List[float]:
    """
    Subtract each adversarial value from its paired baseline value.

    Di = R_base,i - R_adv,i

    Args:
        baseline_values: List of baseline values
        adversarial_values: List of adversarial values, aligned by index
            with ``baseline_values``

    Returns:
        List of paired differences, in input order

    Raises:
        ValueError: If the two lists differ in length.
    """
    if len(baseline_values) == len(adversarial_values):
        return [
            base - adversarial
            for base, adversarial in zip(baseline_values, adversarial_values)
        ]

    raise ValueError(
        "Baseline and adversarial must have same number of values"
    )
|
|
|
|
|
def paired_t_test(
    baseline_values: List[float],
    adversarial_values: List[float],
    alpha: float = 0.05,
) -> Dict[str, Any]:
    """
    Perform a two-sided paired t-test for statistical significance.

    Tests whether the mean difference between paired observations
    (baseline minus adversarial) is significantly different from zero.
    The p-value and the critical value are taken from Student's t
    distribution with ``n - 1`` degrees of freedom, so small samples are
    not over-reported as significant.

    Args:
        baseline_values: List of baseline values
        adversarial_values: List of adversarial values, aligned by index
            with ``baseline_values``
        alpha: Significance level (default 0.05)

    Returns:
        Dictionary with keys ``statistically_significant``, ``p_value``,
        ``t_statistic``, ``mean_difference``, ``degrees_of_freedom``,
        ``critical_value`` and ``sample_size``.

        With fewer than two pairs no test is possible: the result is
        reported as not significant with ``p_value`` 1.0 and
        ``critical_value`` None.

        When every difference is identical and non-zero, the standard error
        is zero; ``t_statistic`` is then ``±inf`` and ``p_value`` is 0.0.

    Raises:
        ValueError: If the two lists differ in length.
    """
    if len(baseline_values) != len(adversarial_values):
        raise ValueError("Values must be paired (same length)")

    n = len(baseline_values)
    if n < 2:
        # A variance estimate needs at least two pairs; still report the
        # one observed difference rather than a misleading 0.0.
        observed_difference = (
            float(baseline_values[0] - adversarial_values[0]) if n == 1 else 0.0
        )
        return {
            "statistically_significant": False,
            "p_value": 1.0,
            "t_statistic": 0.0,
            "mean_difference": observed_difference,
            "degrees_of_freedom": 0,
            "critical_value": None,
            "sample_size": n,
        }

    differences = calculate_paired_differences(baseline_values, adversarial_values)
    mean_diff = calculate_mean(differences)
    std_diff = calculate_sample_std(differences)
    df = n - 1

    if std_diff == 0:
        if mean_diff == 0:
            # All pairs are identical: no evidence of any difference.
            t_stat = 0.0
            p_value = 1.0
        else:
            # Every pair differs by exactly the same non-zero amount: the
            # effect is perfectly consistent, so the statistic diverges.
            t_stat = math.copysign(math.inf, mean_diff)
            p_value = 0.0
    else:
        std_error = std_diff / math.sqrt(n)
        t_stat = mean_diff / std_error
        p_value = _student_t_two_sided_p(t_stat, df)

    critical_value = _student_t_critical_value(alpha, df)

    return {
        "statistically_significant": p_value < alpha,
        "p_value": p_value,
        "t_statistic": t_stat,
        "mean_difference": mean_diff,
        "degrees_of_freedom": df,
        "critical_value": critical_value,
        "sample_size": n,
    }


def _regularized_incomplete_beta(a: float, b: float, x: float) -> float:
    """Return the regularized incomplete beta function I_x(a, b)."""
    if x <= 0.0:
        return 0.0
    if x >= 1.0:
        return 1.0

    # The continued fraction converges quickly only below this threshold;
    # above it, use the symmetry I_x(a, b) = 1 - I_{1-x}(b, a).
    if x > (a + 1.0) / (a + b + 2.0):
        return 1.0 - _regularized_incomplete_beta(b, a, 1.0 - x)

    log_prefactor = (
        math.lgamma(a + b)
        - math.lgamma(a)
        - math.lgamma(b)
        + a * math.log(x)
        + b * math.log1p(-x)
    )
    prefactor = math.exp(log_prefactor) / a

    # Modified Lentz evaluation of the continued fraction.
    tiny = 1e-300
    c = 1.0
    d = 1.0 - (a + b) * x / (a + 1.0)
    if abs(d) < tiny:
        d = tiny
    d = 1.0 / d
    fraction = d

    for m in range(1, 10001):
        two_m = 2 * m

        # Even-indexed term of the recurrence.
        term = m * (b - m) * x / ((a + two_m - 1.0) * (a + two_m))
        d = 1.0 + term * d
        if abs(d) < tiny:
            d = tiny
        c = 1.0 + term / c
        if abs(c) < tiny:
            c = tiny
        d = 1.0 / d
        fraction *= d * c

        # Odd-indexed term of the recurrence.
        term = -(a + m) * (a + b + m) * x / ((a + two_m) * (a + two_m + 1.0))
        d = 1.0 + term * d
        if abs(d) < tiny:
            d = tiny
        c = 1.0 + term / c
        if abs(c) < tiny:
            c = tiny
        d = 1.0 / d
        step = d * c
        fraction *= step

        if abs(step - 1.0) < 1e-14:
            break

    return prefactor * fraction


def _student_t_two_sided_p(t_stat: float, df: int) -> float:
    """Return P(|T| >= |t_stat|) for Student's t with ``df`` degrees of freedom."""
    if math.isnan(t_stat):
        return math.nan

    # Multiplication (unlike ``**``) overflows to inf instead of raising.
    t_squared = t_stat * t_stat
    if math.isinf(t_squared):
        return 0.0

    return _regularized_incomplete_beta(0.5 * df, 0.5, df / (df + t_squared))


def _student_t_critical_value(alpha: float, df: int) -> float:
    """Return the two-sided critical |t| for significance level ``alpha``."""
    if alpha <= 0.0:
        return math.inf
    if alpha >= 1.0:
        return 0.0

    # Bracket the root: the two-sided p-value decreases monotonically in |t|.
    low, high = 0.0, 1.0
    while _student_t_two_sided_p(high, df) > alpha:
        low = high
        high *= 2.0

    for _ in range(100):
        mid = 0.5 * (low + high)
        if _student_t_two_sided_p(mid, df) > alpha:
            low = mid
        else:
            high = mid

    return 0.5 * (low + high)
|
|
|
|
|
def _normal_cdf(x: float) -> float:
    """Return the standard normal CDF at ``x``, computed from ``math.erf``."""
    scaled = x / math.sqrt(2)
    return 0.5 * (1 + math.erf(scaled))
|
|
|
|
|
def _normal_quantile(p: float) -> float:
    """
    Approximate the standard normal quantile (inverse CDF) at ``p``.

    Uses Acklam's rational approximation: one rational function of
    ``(p - 0.5)`` in the central region and another of
    ``sqrt(-2 * ln(1 - p))`` in the upper tail. The lower half is obtained
    by symmetry.

    Args:
        p: Cumulative probability.

    Returns:
        The value ``x`` such that the standard normal CDF at ``x`` is
        approximately ``p``. Probabilities beyond 0.999999 (or, by symmetry,
        below 0.000001) are clamped, so the result stays finite.
    """
    # Lower half by symmetry, so only p >= 0.5 is handled below.
    if p < 0.5:
        return -_normal_quantile(1 - p)

    # Clamp so that log(1 - p) below is always defined.
    if p > 0.999999:
        p = 0.999999

    # Coefficients for the central region.
    a1 = -3.969683028665376e1
    a2 = 2.209460984245205e2
    a3 = -2.759285104469687e2
    a4 = 1.383577518672690e2
    a5 = -3.066479806614716e1
    a6 = 2.506628277459239e0

    b1 = -5.447609879822406e1
    b2 = 1.615858368580409e2
    b3 = -1.556989798598866e2
    b4 = 6.680131188771972e1
    b5 = -1.328068155288572e1

    # Coefficients for the tails.
    c1 = -7.784894002430293e-3
    c2 = -3.223964580411365e-1
    c3 = -2.400758277161838e0
    c4 = -2.549732539343734e0
    c5 = 4.374664141464968e0
    c6 = 2.938163982698783e0

    d1 = 7.784695709041462e-3
    d2 = 3.224671290700398e-1
    d3 = 2.445134137142996e0
    d4 = 3.754408661907416e0

    p_high = 1 - 0.02425

    if p <= p_high:
        # Central region: rational function of (p - 0.5)^2, scaled by (p - 0.5).
        q = p - 0.5
        r = q * q
        numerator = (
            ((((a1 * r + a2) * r + a3) * r + a4) * r + a5) * r + a6
        ) * q
        denominator = ((((b1 * r + b2) * r + b3) * r + b4) * r + b5) * r + 1
        return numerator / denominator

    # Upper tail: the rational function is negative here, hence the negation.
    q = math.sqrt(-2 * math.log(1 - p))
    numerator = ((((c1 * q + c2) * q + c3) * q + c4) * q + c5) * q + c6
    denominator = (((d1 * q + d2) * q + d3) * q + d4) * q + 1
    return -numerator / denominator
|
|
|
|
|
|
|
|
|
|
|
|
|
def cohens_d(
    group1: List[float],
    group2: List[float],
) -> float:
    """
    Compute Cohen's d, the standardized difference between two group means.

    The difference ``mean(group1) - mean(group2)`` is divided by the pooled
    sample standard deviation of the two groups.

    Interpretation:
    - |d| < 0.2: negligible
    - 0.2 <= |d| < 0.5: small
    - 0.5 <= |d| < 0.8: medium
    - |d| >= 0.8: large

    Args:
        group1: First group of values
        group2: Second group of values

    Returns:
        Cohen's d value. ``0.0`` is returned when either group has fewer
        than two values or when the pooled standard deviation is zero.
    """
    size_a, size_b = len(group1), len(group2)
    if min(size_a, size_b) < 2:
        return 0.0

    mean_a = calculate_mean(group1)
    mean_b = calculate_mean(group2)
    spread_a = calculate_sample_std(group1)
    spread_b = calculate_sample_std(group2)

    # Weight each group's variance by its degrees of freedom.
    pooled_variance = (
        (size_a - 1) * spread_a**2 + (size_b - 1) * spread_b**2
    ) / (size_a + size_b - 2)
    pooled_spread = math.sqrt(pooled_variance)

    if pooled_spread == 0:
        return 0.0
    return (mean_a - mean_b) / pooled_spread
|
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_vulnerability_consistency(
    baseline_robustness: List[float],
    adversarial_robustness: List[float],
) -> Dict[str, float]:
    """
    Measure how uniformly robustness drops under adversarial attack.

    The per-pair drop is ``baseline - adversarial``. The consistency score
    is ``1 - min(2 * std, 1)``, where ``std`` is the population standard
    deviation of the drops, so a spread of 0.5 or more scores ``0.0``.

    Args:
        baseline_robustness: List of baseline robustness values
        adversarial_robustness: List of adversarial robustness values,
            aligned by index with ``baseline_robustness``

    Returns:
        Dictionary with ``mean_delta``, ``std_delta`` and
        ``consistency_score``; all ``0.0`` for empty input.

    Raises:
        ValueError: If the two lists differ in length.
    """
    if len(baseline_robustness) != len(adversarial_robustness):
        raise ValueError("Lists must have same length")

    if not baseline_robustness:
        return {
            "mean_delta": 0.0,
            "std_delta": 0.0,
            "consistency_score": 0.0,
        }

    drops = [
        base - adversarial
        for base, adversarial in zip(baseline_robustness, adversarial_robustness)
    ]
    average_drop = calculate_mean(drops)
    drop_spread = calculate_standard_deviation(drops)

    # Doubling the spread and capping at 1 keeps the score within [0, 1].
    penalty = min(drop_spread * 2, 1.0)

    return {
        "mean_delta": average_drop,
        "std_delta": drop_spread,
        "consistency_score": 1.0 - penalty,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_summary_statistics(
    values: List[float],
    confidence: float = 0.95,
) -> Dict[str, float]:
    """
    Generate comprehensive summary statistics.

    Args:
        values: List of values
        confidence: Confidence level for the confidence interval of the
            mean, passed on to ``calculate_confidence_interval``

    Returns:
        Dictionary with ``count``, ``mean``, ``std`` (population),
        ``sample_std``, ``min``, ``max``, ``median``, ``q25``, ``q75``,
        ``lower_ci``, ``upper_ci`` and ``margin_of_error``. The quartiles
        use linear interpolation between neighbouring order statistics.
        Empty input yields the same keys with zero values.
    """
    if not values:
        return {
            "count": 0,
            "mean": 0.0,
            "std": 0.0,
            "sample_std": 0.0,
            "min": 0.0,
            "max": 0.0,
            "median": 0.0,
            "q25": 0.0,
            "q75": 0.0,
            "lower_ci": 0.0,
            "upper_ci": 0.0,
            "margin_of_error": 0.0,
        }

    sorted_values = sorted(values)
    n = len(sorted_values)
    middle = n // 2

    if n % 2 == 1:
        median = sorted_values[middle]
    else:
        median = (sorted_values[middle - 1] + sorted_values[middle]) / 2

    lower_ci, upper_ci, margin_of_error = calculate_confidence_interval(
        values, confidence
    )

    return {
        "count": n,
        "mean": calculate_mean(values),
        "std": calculate_standard_deviation(values),
        "sample_std": calculate_sample_std(values),
        "min": sorted_values[0],
        "max": sorted_values[-1],
        "median": median,
        "q25": _interpolated_quantile(sorted_values, 0.25),
        "q75": _interpolated_quantile(sorted_values, 0.75),
        "lower_ci": lower_ci,
        "upper_ci": upper_ci,
        "margin_of_error": margin_of_error,
    }


def _interpolated_quantile(ordered: List[float], fraction: float) -> float:
    """Return the ``fraction`` quantile of non-empty, pre-sorted data."""
    # Fractional rank among the order statistics, then blend its neighbours.
    position = (len(ordered) - 1) * fraction
    lower_index = int(position)
    upper_index = min(lower_index + 1, len(ordered) - 1)
    weight = position - lower_index
    lower_value = ordered[lower_index]
    return lower_value + weight * (ordered[upper_index] - lower_value)
|
|
|
|
|
# Names exported by ``from <module> import *``. The underscore-prefixed
# helpers (_normal_cdf, _normal_quantile) are not listed here.
__all__ = [
    "calculate_mean",
    "calculate_standard_deviation",
    "calculate_sample_std",
    "calculate_variance",
    "MetricStatistics",
    "calculate_confidence_interval",
    "calculate_mean_with_ci",
    "calculate_paired_differences",
    "paired_t_test",
    "cohens_d",
    "calculate_vulnerability_consistency",
    "generate_summary_statistics",
]
|
|
|