# Zenith-7b-V1/tests/evaluation/comparative_eval.py
"""Comparative Evaluation Between Model Sizes and Architectures"""
import json
import logging
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List
import numpy as np
import torch
from scipy import stats
logger = logging.getLogger(__name__)
@dataclass
class ModelComparison:
"""Results of comparing two models."""
model_a_name: str
model_b_name: str
metrics_a: Dict[str, float]
metrics_b: Dict[str, float]
    differences: Dict[str, Dict[str, float]] = field(default_factory=dict)
statistical_tests: Dict[str, Dict[str, float]] = field(default_factory=dict)
summary: str = ""
def compute_differences(self):
"""Compute absolute and relative differences."""
self.differences = {}
for key in self.metrics_a:
if key in self.metrics_b:
diff = self.metrics_a[key] - self.metrics_b[key]
rel_diff = diff / (self.metrics_b[key] if self.metrics_b[key] != 0 else 1e-8)
self.differences[key] = {
"absolute": diff,
"relative": rel_diff,
}
def compute_statistical_tests(self, samples_a: List[float], samples_b: List[float]):
"""Run statistical significance tests."""
if len(samples_a) < 2 or len(samples_b) < 2:
return
# T-test
t_stat, t_p = stats.ttest_ind(samples_a, samples_b, equal_var=False)
# Mann-Whitney U test (non-parametric)
u_stat, u_p = stats.mannwhitneyu(samples_a, samples_b, alternative="two-sided")
self.statistical_tests = {
"t_test": {"statistic": float(t_stat), "p_value": float(t_p)},
"mann_whitney_u": {"statistic": float(u_stat), "p_value": float(u_p)},
}
def generate_summary(self) -> str:
"""Generate human-readable summary."""
lines = [
f"Model Comparison: {self.model_a_name} vs {self.model_b_name}",
"=" * 60,
]
for metric, diffs in self.differences.items():
a_val = self.metrics_a[metric]
b_val = self.metrics_b[metric]
abs_diff = diffs["absolute"]
rel_diff = diffs["relative"] * 100
            # Assumes higher is better; invert the winner for lower-is-better metrics like perplexity.
            if abs_diff > 0:
                better = self.model_a_name
            elif abs_diff < 0:
                better = self.model_b_name
            else:
                better = "tie"
lines.append(f"{metric:30s}: {a_val:.4f} vs {b_val:.4f} "
f"(diff: {abs_diff:+.4f}, {rel_diff:+.1f}%) -> {better}")
lines.append("\nStatistical Significance:")
for test_name, results in self.statistical_tests.items():
p_val = results["p_value"]
sig = "significant" if p_val < 0.05 else "not significant"
lines.append(f" {test_name}: p={p_val:.4f} ({sig})")
self.summary = "\n".join(lines)
return self.summary
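

# Illustrative sketch (not called by the pipeline): how ModelComparison is used on its
# own. The metric names and values below are invented for demonstration; real metrics
# come from ComparativeEvaluator._run_benchmarks.
def _example_model_comparison() -> str:
    comparison = ModelComparison(
        model_a_name="zenith-7b",
        model_b_name="zenith-32b",
        metrics_a={"gsm8k_accuracy": 0.41, "wikitext_perplexity": 8.2},
        metrics_b={"gsm8k_accuracy": 0.58, "wikitext_perplexity": 6.9},
    )
    comparison.compute_differences()
    # Placeholder per-run scores; real values would come from repeated evaluation runs.
    comparison.compute_statistical_tests([0.40, 0.42, 0.41], [0.57, 0.59, 0.58])
    return comparison.generate_summary()
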
class ComparativeEvaluator:
"""Evaluate and compare multiple models."""
def __init__(
self,
models: Dict[str, torch.nn.Module],
tokenizers: Dict[str, Any],
benchmark_config: Any,
):
self.models = models
self.tokenizers = tokenizers
self.config = benchmark_config
def compare_models(
self,
model_names: List[str],
benchmark_datasets: Dict[str, Any],
) -> ModelComparison:
"""Compare two models on multiple benchmarks."""
if len(model_names) != 2:
raise ValueError("Can only compare exactly 2 models")
name_a, name_b = model_names
model_a = self.models[name_a]
model_b = self.models[name_b]
tokenizer_a = self.tokenizers[name_a]
tokenizer_b = self.tokenizers[name_b]
# Run benchmarks
metrics_a = self._run_benchmarks(model_a, tokenizer_a, benchmark_datasets)
metrics_b = self._run_benchmarks(model_b, tokenizer_b, benchmark_datasets)
comparison = ModelComparison(
model_a_name=name_a,
model_b_name=name_b,
metrics_a=metrics_a,
metrics_b=metrics_b,
)
comparison.compute_differences()
# Note: statistical tests would require multiple runs/samples
comparison.generate_summary()
return comparison
def _run_benchmarks(
self,
model: torch.nn.Module,
tokenizer: Any,
datasets: Dict[str, Any],
) -> Dict[str, float]:
"""Run all benchmarks on a model."""
from .benchmark import BenchmarkSuite, BenchmarkConfig
config = BenchmarkConfig(
batch_size=self.config.batch_size,
max_seq_length=self.config.max_seq_length,
datasets=list(datasets.keys()),
)
suite = BenchmarkSuite(model, tokenizer, config)
results = suite.run_all_benchmarks()
# Flatten results
flat_metrics = {}
for category, metrics in results["benchmarks"].items():
if isinstance(metrics, dict):
for key, value in metrics.items():
if isinstance(value, (int, float)):
flat_metrics[f"{category}_{key}"] = value
elif isinstance(metrics, (int, float)):
flat_metrics[category] = metrics
return flat_metrics
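
    # The flattened dict keys each score as "<category>_<metric>". A hypothetical
    # result (values invented for illustration) might look like:
    #   {"human_eval_pass@1": 0.31, "gsm8k_accuracy": 0.42, "wikitext_perplexity": 8.2}
    # which is the shape ModelComparison diffs metric-by-metric.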
def generate_comparison_report(
self,
comparisons: List[ModelComparison],
output_path: str,
):
"""Generate comprehensive comparison report."""
report = {
"timestamp": torch.datetime.now().isoformat(),
"comparisons": [],
}
for comp in comparisons:
report["comparisons"].append({
"models": [comp.model_a_name, comp.model_b_name],
"metrics_a": comp.metrics_a,
"metrics_b": comp.metrics_b,
"differences": comp.differences,
"statistical_tests": comp.statistical_tests,
"summary": comp.summary,
})
with open(output_path, 'w') as f:
json.dump(report, f, indent=2)
logger.info(f"Comparison report saved to {output_path}")
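

# Rough usage sketch for ComparativeEvaluator (names here are assumptions for
# illustration; `models` and `tokenizers` must share keys, and `benchmark_config`
# must expose batch_size and max_seq_length):
#
#     evaluator = ComparativeEvaluator(models, tokenizers, benchmark_config)
#     comparison = evaluator.compare_models(["zenith-7b", "zenith-32b"], datasets)
#     print(comparison.summary)
#     evaluator.generate_comparison_report([comparison], "reports/7b_vs_32b.json")
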
def compare_model_sizes(
models: Dict[str, torch.nn.Module],
tokenizers: Dict[str, Any],
config: Any,
output_dir: str,
) -> Dict[str, ModelComparison]:
"""Compare 7B vs 32B vs 70B models."""
comparisons = {}
evaluator = ComparativeEvaluator(models, tokenizers, config)
# Load benchmark datasets
from .eval_datasets import load_human_eval, load_gsm8k, load_truthfulqa
datasets = {
"human_eval": load_human_eval()[:100],
"gsm8k": load_gsm8k()[:100],
"truthfulqa": load_truthfulqa()[:100],
}
# Compare all pairs
model_names = list(models.keys())
for i in range(len(model_names)):
for j in range(i + 1, len(model_names)):
pair = (model_names[i], model_names[j])
logger.info(f"Comparing {pair[0]} vs {pair[1]}")
comparison = evaluator.compare_models(list(pair), datasets)
comparisons[f"{pair[0]}_vs_{pair[1]}"] = comparison
# Save individual comparison
output_path = f"{output_dir}/comparison_{pair[0]}_vs_{pair[1]}.json"
evaluator.generate_comparison_report([comparison], output_path)
return comparisons
def analyze_scaling_laws(comparisons: Dict[str, ModelComparison]) -> Dict[str, Any]:
"""Analyze scaling laws from model comparisons."""
# Extract size vs performance data
sizes = [] # In parameters (B)
perplexities = []
accuracies = []
# Map model names to sizes (this would come from configs)
size_map = {"zenith-7b": 7, "zenith-32b": 32, "zenith-70b": 70}
    seen_models = set()
    for comp in comparisons.values():
        # Collect metrics from both sides of each comparison, one set of points per model
        for model_name, metrics in (
            (comp.model_a_name, comp.metrics_a),
            (comp.model_b_name, comp.metrics_b),
        ):
            if model_name not in size_map or model_name in seen_models:
                continue
            seen_models.add(model_name)
            for metric, value in metrics.items():
                if "perplexity" in metric:
                    sizes.append(size_map[model_name])
                    perplexities.append(value)
                elif "accuracy" in metric or "pass@1" in metric:
                    accuracies.append((size_map[model_name], value))
    # Compute scaling exponent via a power-law fit in log-log space
    if len(sizes) >= 2 and len(perplexities) >= 2:
        log_sizes = np.log(sizes)
        log_ppl = np.log(perplexities)
        slope, intercept, r_value, p_value, std_err = stats.linregress(log_sizes, log_ppl)
        scaling_exponent = -slope  # Negative because larger models have lower perplexity
        r_squared = float(r_value ** 2)
    else:
        scaling_exponent = None
        r_squared = None
    return {
        "sizes": sizes,
        "perplexities": perplexities,
        "accuracies": accuracies,
        "scaling_exponent": scaling_exponent,
        "r_squared": r_squared,
    }
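

if __name__ == "__main__":
    # Minimal, self-contained sketch of the power-law fit performed in
    # analyze_scaling_laws, run on synthetic numbers. The perplexities below are
    # invented for illustration, not measured Zenith results.
    demo_sizes = np.array([7.0, 32.0, 70.0])  # model sizes in billions of parameters
    demo_ppls = np.array([8.2, 6.4, 5.6])     # hypothetical perplexities
    slope, _, r_value, _, _ = stats.linregress(np.log(demo_sizes), np.log(demo_ppls))
    print(f"scaling exponent: {-slope:.3f}, R^2: {r_value ** 2:.3f}")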