# Codette-Reasoning / evaluation / phase6_benchmarks.py
# Raiff1982's picture
# Upload 120 files
# ed1b365 verified
"""
Phase 6: Benchmarking Suite
Measures Phase 6 improvements:
1. Multi-round debate: Does accuracy improve across rounds?
2. Memory weighting: Does memory-boosted routing reduce error?
3. Semantic tension: Are embeddings better than heuristics?
4. Specialization: Are adapters maintaining domain expertise?
Run with: pytest test_phase6_e2e.py -v
"""
import json
from datetime import datetime
from typing import Dict, List, Optional, Tuple

import numpy as np
class Phase6Benchmarks:
    """Comprehensive Phase 6 evaluation suite.

    Each ``benchmark_*`` method runs one experiment against the supplied
    engine, stores its findings under the matching key of ``self.results``
    and returns that same dictionary.  When the engine (or the component a
    benchmark needs) is missing, the method returns ``{"error": ...}``
    instead and leaves ``self.results`` untouched.
    """

    # Neutral value substituted when the engine omits a metric, or when no
    # data could be collected for an average.
    _DEFAULT_SCORE = 0.5

    def __init__(self, forge_engine=None):
        """Initialize benchmarks.

        Args:
            forge_engine: ForgeEngine instance to test against.  May be
                ``None``, in which case engine-backed benchmarks return an
                error dictionary.
        """
        self.forge = forge_engine
        self.results = {
            "timestamp": datetime.now().isoformat(),
            "multi_round_convergence": {},   # Coherence per round
            "memory_weighting_impact": {},   # With vs. without memory
            "semantic_tension_quality": {},  # Embeddings vs heuristics
            "specialization_metrics": {},    # Domain expertise scores
        }

    @classmethod
    def _mean(cls, values) -> float:
        """Return the mean of *values*, or the neutral default when empty."""
        return float(np.mean(values)) if values else cls._DEFAULT_SCORE

    @staticmethod
    def _safe_correlation(xs, ys) -> float:
        """Return the Pearson correlation of two series, or 0.0 if undefined.

        The correlation is undefined with fewer than two points, with series
        of different lengths, or when either series is constant (zero
        variance makes ``np.corrcoef`` yield NaN, which is not valid JSON).
        """
        if len(xs) < 2 or len(xs) != len(ys):
            return 0.0
        x = np.asarray(xs, dtype=float)
        y = np.asarray(ys, dtype=float)
        if np.std(x) == 0 or np.std(y) == 0:
            return 0.0
        corr = float(np.corrcoef(x, y)[0, 1])
        return corr if np.isfinite(corr) else 0.0

    def benchmark_multi_round_debate(self, queries: List[str], num_rounds: int = 3) -> Dict:
        """BENCHMARK 1: Multi-Round Debate Convergence.

        Question: Does multi-round debate improve answer quality?

        Measurement:
            - Run each query through ``forge_with_debate`` with ``num_rounds``.
            - Read ``coherence`` and ``resolution_rate`` from the
              ``round_<i>`` entries of the returned ``metadata`` (missing
              values default to 0.5; missing rounds are skipped).
            - A query counts as improved when its own final-round coherence
              exceeds its own round-0 coherence.
            - Convergence rate is the relative change of the mean coherence
              from round 0 to the final round (positive = improvement).

        Queries that raise are reported on stdout and contribute nothing to
        the per-round aggregates.

        Args:
            queries: Queries to run through the debate.
            num_rounds: Number of debate rounds requested per query.

        Returns:
            {
                "queries_tested": int,
                "rounds_per_query": int,
                "coherence_by_round": {round: mean score},
                "resolution_by_round": {round: mean score},
                "convergence_rate": float,
                "improved_queries": int,
                "improvement_percentage": float,
            }
            or ``{"error": ...}`` when no engine is configured.
        """
        if not self.forge:
            return {"error": "ForgeEngine not available"}

        default = self._DEFAULT_SCORE
        last_round = num_rounds - 1
        coherence_by_round = {i: [] for i in range(num_rounds)}
        resolution_by_round = {i: [] for i in range(num_rounds)}
        improved_count = 0

        for query in queries:
            try:
                result = self.forge.forge_with_debate(query, num_rounds=num_rounds)
                metadata = result.get("metadata", {})

                # Gather this query's rounds on their own first, so that a
                # failure part-way through leaves the aggregates clean and
                # the improvement check never reads another query's scores.
                query_coherence = {}
                query_resolution = {}
                for round_num in range(num_rounds):
                    round_key = f"round_{round_num}"
                    if round_key in metadata:
                        round_meta = metadata[round_key]
                        query_coherence[round_num] = round_meta.get("coherence", default)
                        query_resolution[round_num] = round_meta.get("resolution_rate", default)

                improved = bool(
                    query_coherence.get(last_round, default) > query_coherence.get(0, default)
                )
            except Exception as e:
                print(f"Error benchmarking query '{query[:50]}...': {e}")
                continue

            for round_num, score in query_coherence.items():
                coherence_by_round[round_num].append(score)
            for round_num, score in query_resolution.items():
                resolution_by_round[round_num].append(score)
            if improved:
                improved_count += 1

        # Compute statistics
        coherence_means = {i: self._mean(scores) for i, scores in coherence_by_round.items()}
        resolution_means = {i: self._mean(scores) for i, scores in resolution_by_round.items()}

        convergence_rate = 0.0
        if num_rounds > 1:
            initial = coherence_means.get(0, default)
            final = coherence_means.get(last_round, default)
            if initial > 0:
                convergence_rate = (final - initial) / initial  # Positive = improvement

        self.results["multi_round_convergence"] = {
            "queries_tested": len(queries),
            "rounds_per_query": num_rounds,
            "coherence_by_round": {str(k): round(v, 3) for k, v in coherence_means.items()},
            "resolution_by_round": {str(k): round(v, 3) for k, v in resolution_means.items()},
            "convergence_rate": round(convergence_rate, 3),
            "improved_queries": improved_count,
            "improvement_percentage": round(100 * improved_count / max(len(queries), 1), 1),
        }
        return self.results["multi_round_convergence"]

    def benchmark_memory_weighting(self, queries: List[str]) -> Dict:
        """BENCHMARK 2: Memory Weighting Impact.

        Question: Does memory-weighted routing reduce error vs. the baseline?

        Measurement:
            - Run each query with ``use_memory_weights=False`` (baseline).
            - Run each query with ``use_memory_weights=True``.
            - Compare ``coherence`` and ``resolution_rate`` from the returned
              ``metadata`` (missing values default to 0.5).

        A query only contributes when BOTH runs succeed, so the two averages
        always cover the same set of queries.  Failures are reported on
        stdout.  ``memory_helps_percentage`` is relative to ``len(queries)``,
        i.e. failed queries count as "memory did not help".

        Args:
            queries: Queries to run in both configurations.

        Returns:
            {
                "queries_tested": int,
                "baseline_avg_coherence": float,
                "memory_avg_coherence": float,
                "coherence_delta": float,
                "memory_helps_percentage": float,
                "baseline_avg_resolution": float,
                "memory_avg_resolution": float,
                "resolution_delta": float,
            }
            or ``{"error": ...}`` when no engine is configured.
        """
        if not self.forge:
            return {"error": "ForgeEngine not available"}

        default = self._DEFAULT_SCORE
        baseline_coherences = []
        memory_coherences = []
        baseline_resolutions = []
        memory_resolutions = []

        for query in queries:
            try:
                # Baseline: without memory weights
                baseline_meta = self.forge.forge_with_debate(
                    query, use_memory_weights=False
                ).get("metadata", {})
                # With memory: weights from past performance
                memory_meta = self.forge.forge_with_debate(
                    query, use_memory_weights=True
                ).get("metadata", {})

                baseline_coh_q = baseline_meta.get("coherence", default)
                baseline_res_q = baseline_meta.get("resolution_rate", default)
                memory_coh_q = memory_meta.get("coherence", default)
                memory_res_q = memory_meta.get("resolution_rate", default)
            except Exception as e:
                print(f"Error in memory weighting benchmark: {e}")
                continue

            # Record the pair together so the lists stay aligned.
            baseline_coherences.append(baseline_coh_q)
            baseline_resolutions.append(baseline_res_q)
            memory_coherences.append(memory_coh_q)
            memory_resolutions.append(memory_res_q)

        # Compute statistics
        baseline_coh = self._mean(baseline_coherences)
        memory_coh = self._mean(memory_coherences)
        baseline_res = self._mean(baseline_resolutions)
        memory_res = self._mean(memory_resolutions)

        # Percentage of queries where memory helped (memory strictly better).
        improved = sum(
            1 for b, m in zip(baseline_coherences, memory_coherences) if m > b
        )
        help_percentage = 100 * improved / max(len(queries), 1)

        self.results["memory_weighting_impact"] = {
            "queries_tested": len(queries),
            "baseline_avg_coherence": round(baseline_coh, 3),
            "memory_avg_coherence": round(memory_coh, 3),
            "coherence_delta": round(memory_coh - baseline_coh, 3),
            "memory_helps_percentage": round(help_percentage, 1),
            "baseline_avg_resolution": round(baseline_res, 3),
            "memory_avg_resolution": round(memory_res, 3),
            "resolution_delta": round(memory_res - baseline_res, 3),
        }
        return self.results["memory_weighting_impact"]

    def benchmark_semantic_tension(
        self, conflict_samples: Optional[List[Tuple[str, str, float]]] = None
    ) -> Dict:
        """BENCHMARK 3: Semantic Tension Quality.

        Question: Are embedding-based tensions better than heuristics?

        Measurement:
            - For each known conflict pair (with ground truth tension),
              compute the semantic tension and the heuristic opposition score.
            - Measure the correlation of each series with the ground truth.

        A sample only contributes when both scores could be computed, so the
        three series always have the same length.  Failures are reported on
        stdout.  An undefined correlation (fewer than two usable samples, or
        a constant series) is reported as 0.0.

        Args:
            conflict_samples: List of (claim_a, claim_b, ground_truth_tension).

        Returns:
            {
                "samples_tested": int,
                "heuristic_correlation": float,
                "semantic_correlation": float,
                "semantic_advantage": float,
                "semantic_better": bool,
            }
            or ``{"error": ...}`` when the engine, its semantic tension
            component, or the samples are missing.
        """
        tension_engine = getattr(self.forge, "semantic_tension_engine", None) if self.forge else None
        if not tension_engine:
            return {"error": "SemanticTensionEngine not available"}
        if not conflict_samples:
            return {"error": "No conflict samples provided"}

        heuristic_scores = []
        semantic_scores = []
        ground_truths = []

        for claim_a, claim_b, ground_truth in conflict_samples:
            try:
                # Get semantic tension
                semantic_tension = tension_engine.compute_semantic_tension(claim_a, claim_b)
                # Get heuristic opposition (from conflict engine)
                _, heuristic_opposition = self.forge.conflict_engine._classify_conflict(
                    claim_a, claim_b, 0.5
                )
            except Exception as e:
                print(f"Error computing tensions: {e}")
                continue

            # Append all three together so the series stay aligned.
            semantic_scores.append(semantic_tension)
            heuristic_scores.append(heuristic_opposition)
            ground_truths.append(ground_truth)

        # Compute correlations with ground truth
        heuristic_corr = self._safe_correlation(heuristic_scores, ground_truths)
        semantic_corr = self._safe_correlation(semantic_scores, ground_truths)
        advantage = semantic_corr - heuristic_corr

        self.results["semantic_tension_quality"] = {
            "samples_tested": len(conflict_samples),
            "heuristic_correlation": round(heuristic_corr, 3),
            "semantic_correlation": round(semantic_corr, 3),
            "semantic_advantage": round(advantage, 3),
            "semantic_better": semantic_corr > heuristic_corr,
        }
        return self.results["semantic_tension_quality"]

    def benchmark_specialization(self) -> Dict:
        """BENCHMARK 4: Specialization Tracking.

        Question: Are adapters maintaining domain specialization?

        Reads ``get_system_health()`` from the engine's specialization
        tracker and groups adapters by their ``recommendation`` value:
        ``"excellent_specialist"`` and ``"good_generalist"``.

        Returns:
            {
                "adapters_tracked": int,
                "specialist_adapters": list,
                "generalist_adapters": list,
                "convergence_risk_count": int,
                "health_by_adapter": {adapter: recommendation},
            }
            or ``{"error": ...}`` when the engine or its specialization
            tracker is missing.
        """
        tracker = getattr(self.forge, "specialization", None) if self.forge else None
        if not tracker:
            return {"error": "SpecializationTracker not available"}

        system_health = tracker.get_system_health()
        health_by_adapter = system_health.get("health_by_adapter", {})
        recommendations = {a: h.get("recommendation") for a, h in health_by_adapter.items()}

        specialists = [a for a, rec in recommendations.items() if rec == "excellent_specialist"]
        generalists = [a for a, rec in recommendations.items() if rec == "good_generalist"]
        convergence_alerts = system_health.get("convergence_alerts", [])

        self.results["specialization_metrics"] = {
            "adapters_tracked": len(health_by_adapter),
            "specialist_adapters": specialists,
            "generalist_adapters": generalists,
            "convergence_risk_count": len(convergence_alerts),
            "health_by_adapter": recommendations,
        }
        return self.results["specialization_metrics"]

    def export_results(self, filepath: Optional[str] = None) -> Dict:
        """Export all benchmark results to JSON.

        Args:
            filepath: Where to save results (optional).  When omitted,
                nothing is written.

        Returns:
            Complete results dict.
        """
        if filepath:
            # Explicit encoding keeps the output independent of the locale.
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(self.results, f, indent=2)
            print(f"Benchmark results saved to {filepath}")
        return self.results

    def summary(self) -> str:
        """Generate human-readable summary of all benchmarks.

        Sections are included only for benchmarks that have stored results.

        Returns:
            Formatted summary string.
        """
        rule = "=" * 60 + "\n"
        parts = ["PHASE 6 BENCHMARK SUMMARY\n", rule]

        # Multi-round convergence
        mr = self.results.get("multi_round_convergence", {})
        if mr:
            parts.append("\n[1] MULTI-ROUND DEBATE CONVERGENCE\n")
            parts.append(f" Queries tested: {mr.get('queries_tested', 0)}\n")
            parts.append(f" Convergence rate: {mr.get('convergence_rate', 0):.3f}\n")
            parts.append(f" Queries improved: {mr.get('improvement_percentage', 0)}%\n")

        # Memory weighting
        mw = self.results.get("memory_weighting_impact", {})
        if mw:
            parts.append("\n[2] MEMORY WEIGHTING IMPACT\n")
            parts.append(f" Baseline coherence: {mw.get('baseline_avg_coherence', 0):.3f}\n")
            parts.append(f" With memory: {mw.get('memory_avg_coherence', 0):.3f}\n")
            parts.append(f" Delta: {mw.get('coherence_delta', 0):.3f}\n")
            parts.append(f" Memory helps: {mw.get('memory_helps_percentage', 0)}% of queries\n")

        # Semantic tension
        st = self.results.get("semantic_tension_quality", {})
        if st:
            parts.append("\n[3] SEMANTIC TENSION QUALITY\n")
            parts.append(f" Semantic correlation: {st.get('semantic_correlation', 0):.3f}\n")
            parts.append(f" Heuristic correlation: {st.get('heuristic_correlation', 0):.3f}\n")
            parts.append(f" Semantic advantage: {st.get('semantic_advantage', 0):.3f}\n")

        # Specialization
        sp = self.results.get("specialization_metrics", {})
        if sp:
            parts.append("\n[4] ADAPTER SPECIALIZATION\n")
            parts.append(f" Adapters tracked: {sp.get('adapters_tracked', 0)}\n")
            parts.append(f" Specialists: {len(sp.get('specialist_adapters', []))}\n")
            parts.append(f" Convergence risks: {sp.get('convergence_risk_count', 0)}\n")

        parts.append("\n" + rule)
        return "".join(parts)
# Names exported by ``from <module> import *``: only the benchmark suite class.
__all__ = ["Phase6Benchmarks"]