File size: 14,667 Bytes

ed1b365

"""
Phase 6: Benchmarking Suite

Measures Phase 6 improvements:
1. Multi-round debate: Does accuracy improve across rounds?
2. Memory weighting: Does memory-boosted routing reduce error?
3. Semantic tension: Are embeddings better than heuristics?
4. Specialization: Are adapters maintaining domain expertise?

Run with: pytest test_phase6_e2e.py -v
"""

import json
import numpy as np
from typing import Dict, List, Tuple
from datetime import datetime


class Phase6Benchmarks:
    """
    Comprehensive Phase 6 evaluation suite.
    """

    def __init__(self, forge_engine=None):
        """
        Initialize benchmarks.

        Args:
            forge_engine: ForgeEngine instance to test against
        """
        self.forge = forge_engine
        self.results = {
            "timestamp": datetime.now().isoformat(),
            "multi_round_convergence": {},      # Coherence per round
            "memory_weighting_impact": {},      # With vs. without memory
            "semantic_tension_quality": {},     # Embeddings vs heuristics
            "specialization_metrics": {},       # Domain expertise scores
        }

    def benchmark_multi_round_debate(self, queries: List[str], num_rounds: int = 3) -> Dict:
        """
        BENCHMARK 1: Multi-Round Debate Convergence

        Question: Does multi-round debate improve answer quality?

        Hypothesis: As agents debate across rounds:
        - Tensions decrease (convergence)
        - Coherence increases
        - Synthesis accuracy improves

        Measurement:
        - Run each query through N rounds
        - Track coherence_score per round
        - Track resolution_rate per round
        - Compute convergence rate (tension decay)

        Returns:
            {
                "queries_tested": int,
                "rounds_per_query": int,
                "coherence_by_round": {round: [scores...]},
                "convergence_rate": float,
                "improved_queries": int,
            }
        """
        if not self.forge:
            return {"error": "ForgeEngine not available"}

        coherence_by_round = {i: [] for i in range(num_rounds)}
        resolution_by_round = {i: [] for i in range(num_rounds)}
        improved_count = 0

        for query in queries:
            try:
                result = self.forge.forge_with_debate(query, num_rounds=num_rounds)
                metadata = result.get("metadata", {})

                # Extract per-round metrics
                for round_num in range(num_rounds):
                    round_key = f"round_{round_num}"
                    if round_key in metadata:
                        coherence = metadata[round_key].get("coherence", 0.5)
                        resolution = metadata[round_key].get("resolution_rate", 0.5)
                        coherence_by_round[round_num].append(coherence)
                        resolution_by_round[round_num].append(resolution)

                # Check if coherence improved from round 0 to final
                initial_coh = coherence_by_round[0][-1] if coherence_by_round[0] else 0.5
                final_coh = coherence_by_round[num_rounds - 1][-1] if coherence_by_round[num_rounds - 1] else 0.5

                if final_coh > initial_coh:
                    improved_count += 1

            except Exception as e:
                print(f"Error benchmarking query '{query[:50]}...': {e}")

        # Compute statistics
        coherence_means = {
            i: float(np.mean(scores)) if scores else 0.5 for i, scores in coherence_by_round.items()
        }

        convergence_rate = 0.0
        if num_rounds > 1:
            initial = coherence_means.get(0, 0.5)
            final = coherence_means.get(num_rounds - 1, 0.5)
            if initial > 0:
                convergence_rate = (final - initial) / initial  # Positive = improvement

        self.results["multi_round_convergence"] = {
            "queries_tested": len(queries),
            "rounds_per_query": num_rounds,
            "coherence_by_round": {str(k): round(v, 3) for k, v in coherence_means.items()},
            "convergence_rate": round(convergence_rate, 3),
            "improved_queries": improved_count,
            "improvement_percentage": round(100 * improved_count / max(len(queries), 1), 1),
        }

        return self.results["multi_round_convergence"]

    def benchmark_memory_weighting(self, queries: List[str]) -> Dict:
        """
        BENCHMARK 2: Memory Weighting Impact

        Question: Does memory-weighted routing reduce error vs. pure keyword routing?

        Hypothesis: Adapter weights from past experience guide routing better
        than keywords alone.

        Measurement:
        - Run each query WITHOUT memory weighting (baseline)
        - Run each query WITH memory weighting
        - Compare: coherence_score, conflict_resolution_rate, adapter_diversity
        - Compute improvement delta

        Returns:
            {
                "baseline_coherence": float,
                "memory_coherence": float,
                "coherence_improvement": float,
                "memory_helps_percentage": float,
                "avg_resolution_baseline": float,
                "avg_resolution_memory": float,
            }
        """
        if not self.forge:
            return {"error": "ForgeEngine not available"}

        baseline_coherences = []
        memory_coherences = []
        baseline_resolutions = []
        memory_resolutions = []

        for query in queries:
            try:
                # Baseline: without memory weights
                result_baseline = self.forge.forge_with_debate(query, use_memory_weights=False)
                baseline_meta = result_baseline.get("metadata", {})
                baseline_coherences.append(baseline_meta.get("coherence", 0.5))
                baseline_resolutions.append(baseline_meta.get("resolution_rate", 0.5))

                # With memory: weights from past performance
                result_memory = self.forge.forge_with_debate(query, use_memory_weights=True)
                memory_meta = result_memory.get("metadata", {})
                memory_coherences.append(memory_meta.get("coherence", 0.5))
                memory_resolutions.append(memory_meta.get("resolution_rate", 0.5))

            except Exception as e:
                print(f"Error in memory weighting benchmark: {e}")

        # Compute statistics
        baseline_coh = float(np.mean(baseline_coherences)) if baseline_coherences else 0.5
        memory_coh = float(np.mean(memory_coherences)) if memory_coherences else 0.5
        coh_improve = memory_coh - baseline_coh

        baseline_res = float(np.mean(baseline_resolutions)) if baseline_resolutions else 0.5
        memory_res = float(np.mean(memory_resolutions)) if memory_resolutions else 0.5

        # Percentage of queries where memory helped
        improved = sum(1 for b, m in zip(memory_coherences, baseline_coherences) if m > b)
        help_percentage = 100 * improved / max(len(queries), 1)

        self.results["memory_weighting_impact"] = {
            "queries_tested": len(queries),
            "baseline_avg_coherence": round(baseline_coh, 3),
            "memory_avg_coherence": round(memory_coh, 3),
            "coherence_delta": round(coh_improve, 3),
            "memory_helps_percentage": round(help_percentage, 1),
            "baseline_avg_resolution": round(baseline_res, 3),
            "memory_avg_resolution": round(memory_res, 3),
            "resolution_delta": round(memory_res - baseline_res, 3),
        }

        return self.results["memory_weighting_impact"]

    def benchmark_semantic_tension(self, conflict_samples: List[Tuple[str, str, float]] = None) -> Dict:
        """
        BENCHMARK 3: Semantic Tension Quality

        Question: Are embedding-based tensions (ξ_semantic) better than heuristics?

        Hypothesis: Semantic embeddings capture *real* disagreement better than
        discrete opposition scores (0.4/0.7/1.0).

        Measurement:
        - For known conflict pairs (with ground truth tension)
        - Compute heuristic opposition_score
        - Compute semantic_tension (embeddings)
        - Measure correlation with ground truth

        Args:
            conflict_samples: List of (claim_a, claim_b, ground_truth_tension)

        Returns:
            {
                "samples_tested": int,
                "heuristic_correlation": float,
                "semantic_correlation": float,
                "semantic_advantage": float,
            }
        """
        if not self.forge or not self.forge.semantic_tension_engine:
            return {"error": "SemanticTensionEngine not available"}

        if not conflict_samples:
            return {"error": "No conflict samples provided"}

        heuristic_scores = []
        semantic_scores = []
        ground_truths = []

        for claim_a, claim_b, ground_truth in conflict_samples:
            try:
                # Get semantic tension
                semantic_tension = self.forge.semantic_tension_engine.compute_semantic_tension(claim_a, claim_b)
                semantic_scores.append(semantic_tension)

                # Get heuristic opposition (from conflict engine)
                _, heuristic_opposition = self.forge.conflict_engine._classify_conflict(claim_a, claim_b, 0.5)
                heuristic_scores.append(heuristic_opposition)

                ground_truths.append(ground_truth)

            except Exception as e:
                print(f"Error computing tensions: {e}")

        # Compute correlations with ground truth
        if len(heuristic_scores) > 1 and len(ground_truths) > 1:
            heuristic_corr = float(np.corrcoef(heuristic_scores, ground_truths)[0, 1])
            semantic_corr = float(np.corrcoef(semantic_scores, ground_truths)[0, 1])
            advantage = semantic_corr - heuristic_corr
        else:
            heuristic_corr = 0.0
            semantic_corr = 0.0
            advantage = 0.0

        self.results["semantic_tension_quality"] = {
            "samples_tested": len(conflict_samples),
            "heuristic_correlation": round(heuristic_corr, 3),
            "semantic_correlation": round(semantic_corr, 3),
            "semantic_advantage": round(advantage, 3),
            "semantic_better": semantic_corr > heuristic_corr,
        }

        return self.results["semantic_tension_quality"]

    def benchmark_specialization(self) -> Dict:
        """
        BENCHMARK 4: Specialization Tracking

        Question: Are adapters maintaining domain specialization?

        Hypothesis: Spec scores trend positive for expert adapters,
        negative for generalists. Convergence alerts trigger when
        adapter outputs become too similar.

        Returns:
            {
                "adapters_tracked": int,
                "specialist_adapters": list,
                "generalist_adapters": list,
                "convergence_risks": list,
                "health_status": str,
            }
        """
        if not self.forge or not self.forge.specialization:
            return {"error": "SpecializationTracker not available"}

        system_health = self.forge.specialization.get_system_health()
        health_by_adapter = system_health.get("health_by_adapter", {})

        specialists = [a for a, h in health_by_adapter.items() if h.get("recommendation") == "excellent_specialist"]
        generalists = [a for a, h in health_by_adapter.items() if h.get("recommendation") == "good_generalist"]
        convergence_alerts = system_health.get("convergence_alerts", [])

        self.results["specialization_metrics"] = {
            "adapters_tracked": len(health_by_adapter),
            "specialist_adapters": specialists,
            "generalist_adapters": generalists,
            "convergence_risk_count": len(convergence_alerts),
            "health_by_adapter": {a: h.get("recommendation") for a, h in health_by_adapter.items()},
        }

        return self.results["specialization_metrics"]

    def export_results(self, filepath: str = None) -> Dict:
        """
        Export all benchmark results to JSON.

        Args:
            filepath: Where to save results (optional)

        Returns:
            Complete results dict
        """
        if filepath:
            with open(filepath, "w") as f:
                json.dump(self.results, f, indent=2)
            print(f"Benchmark results saved to {filepath}")

        return self.results

    def summary(self) -> str:
        """
        Generate human-readable summary of all benchmarks.

        Returns:
            Formatted summary string
        """
        summary = "PHASE 6 BENCHMARK SUMMARY\n"
        summary += "=" * 60 + "\n"

        # Multi-round convergence
        mr = self.results.get("multi_round_convergence", {})
        if mr:
            summary += f"\n[1] MULTI-ROUND DEBATE CONVERGENCE\n"
            summary += f"    Queries tested: {mr.get('queries_tested', 0)}\n"
            summary += f"    Convergence rate: {mr.get('convergence_rate', 0):.3f}\n"
            summary += f"    Queries improved: {mr.get('improvement_percentage', 0)}%\n"

        # Memory weighting
        mw = self.results.get("memory_weighting_impact", {})
        if mw:
            summary += f"\n[2] MEMORY WEIGHTING IMPACT\n"
            summary += f"    Baseline coherence: {mw.get('baseline_avg_coherence', 0):.3f}\n"
            summary += f"    With memory: {mw.get('memory_avg_coherence', 0):.3f}\n"
            summary += f"    Delta: {mw.get('coherence_delta', 0):.3f}\n"
            summary += f"    Memory helps: {mw.get('memory_helps_percentage', 0)}% of queries\n"

        # Semantic tension
        st = self.results.get("semantic_tension_quality", {})
        if st:
            summary += f"\n[3] SEMANTIC TENSION QUALITY\n"
            summary += f"    Semantic correlation: {st.get('semantic_correlation', 0):.3f}\n"
            summary += f"    Heuristic correlation: {st.get('heuristic_correlation', 0):.3f}\n"
            summary += f"    Semantic advantage: {st.get('semantic_advantage', 0):.3f}\n"

        # Specialization
        sp = self.results.get("specialization_metrics", {})
        if sp:
            summary += f"\n[4] ADAPTER SPECIALIZATION\n"
            summary += f"    Adapters tracked: {sp.get('adapters_tracked', 0)}\n"
            summary += f"    Specialists: {len(sp.get('specialist_adapters', []))}\n"
            summary += f"    Convergence risks: {sp.get('convergence_risk_count', 0)}\n"

        summary += "\n" + "=" * 60 + "\n"
        return summary


__all__ = ["Phase6Benchmarks"]