| """ |
| Rigorous Evaluation Test Suite for Codette Phase 6 |
| |
| This test suite answers: |
| 1. Is Codette actually better than baseline? |
| 2. Does Phase 6 provide measurable improvement over Phase 1-5? |
| 3. Is the system gaming coherence (high Γ but low accuracy)? |
| 4. Do individual Phase 6 components add value? |
| |
| Test Strategy: |
| - 25 questions spanning physics, ethics, consciousness, creativity, systems |
| - Run each through 4 conditions (Baseline, Phase 1-5, Phase 6 Full, Phase 6 -PreFlight) |
| - Measure: correctness, reasoning_depth, coherence_score, calibration |
| - Detect: false consensus, adapter convergence, coherence-accuracy divergence |
| """ |
|
|
| import json |
| from typing import Dict, List, Tuple, Optional |
| from dataclasses import dataclass, asdict |
| from datetime import datetime |
|
|
|
|
| @dataclass |
| class EvaluationQuestion: |
| """Single question with ground truth and evaluation criteria.""" |
| query: str |
| category: str |
| difficulty: str |
| ground_truth: str |
| correctness_rubric: str |
| expected_perspectives: List[str] |
|
|
|
|
| @dataclass |
| class EvaluationResult: |
| """Results from running a question through one condition.""" |
| condition: str |
| question_id: str |
| query: str |
|
|
| |
| synthesis: str |
| correctness_score: float |
| reasoning_depth: int |
| calibration_error: float |
|
|
| |
| gamma_score: float |
| num_conflicts_detected: int |
| adapter_convergence: float |
|
|
| |
| elapsed_seconds: float |
|
|
| |
| metadata: Dict |
|
|
|
|
| |
| |
| |
|
|
| EVALUATION_TEST_SUITE = [ |
| |
| EvaluationQuestion( |
| query="What is the speed of light in vacuum?", |
| category="physics", |
| difficulty="easy", |
| ground_truth="299,792,458 meters per second (m/s)", |
| correctness_rubric="Must state value within 1% accuracy or equivalent scientific notation", |
| expected_perspectives=["relativistic constant", "fundamental speed limit", "Maxwell equations consequence"], |
| ), |
| EvaluationQuestion( |
| query="Explain why the sky appears blue during the day", |
| category="physics", |
| difficulty="medium", |
| ground_truth="Rayleigh scattering: shorter blue wavelengths scatter more than red in atmosphere", |
| correctness_rubric="Must mention wavelength-dependent scattering or Rayleigh scattering by name", |
| expected_perspectives=["Rayleigh scattering", "wavelength sensitivity", "particle size", "sunset color"], |
| ), |
| EvaluationQuestion( |
| query="What is the relationship between entropy and time's arrow?", |
| category="physics", |
| difficulty="hard", |
| ground_truth="Entropy increases → define time direction in thermodynamic systems. Central to irreversibility", |
| correctness_rubric="Must connect entropy increase to time direction and thermodynamic asymmetry", |
| expected_perspectives=["second law thermodynamics", "statistical mechanics", "time asymmetry", "reversibility paradox"], |
| ), |
|
|
| |
| EvaluationQuestion( |
| query="Is it ethical to lie to save someone's life?", |
| category="ethics", |
| difficulty="medium", |
| ground_truth="Multiple valid frameworks: deontology (never), consequentialism (yes), virtue ethics (context-dependent)", |
| correctness_rubric="Must present ≥2 conflicting ethical frameworks AND acknowledge context dependency", |
| expected_perspectives=["deontological duties", "consequentialist outcomes", "virtue ethics", "cultural context", "responsibility"], |
| ), |
| EvaluationQuestion( |
| query="Should AI systems be required to explain their decisions?", |
| category="ethics", |
| difficulty="hard", |
| ground_truth="Trade-off: explainability vs. performance. Context matters (medical vs. recommendation)", |
| correctness_rubric="Must identify competing values and context-sensitivity, not just yes/no", |
| expected_perspectives=["transparency value", "technical feasibility", "stakeholder rights", "accuracy-interpretability tradeoff"], |
| ), |
| EvaluationQuestion( |
| query="What makes an action morally right or wrong?", |
| category="ethics", |
| difficulty="hard", |
| ground_truth="Framework-dependent: deontology (rules), consequentialism (outcomes), virtue ethics (character), care ethics (relationships)", |
| correctness_rubric="Must present ≥3 distinct frameworks and acknowledge incommensurable values", |
| expected_perspectives=["deontological duties", "consequences", "virtue", "relationships", "cultural variation"], |
| ), |
|
|
| |
| EvaluationQuestion( |
| query="Can machines be conscious?", |
| category="consciousness", |
| difficulty="hard", |
| ground_truth="Depends on definition of consciousness. Intrinsic feature (hard problem) vs. functional property", |
| correctness_rubric="Must articulate the hard problem of consciousness AND address definitional dependence", |
| expected_perspectives=["functionalism", "panpsychism", "emergentism", "philosophical zombies", "Chinese room"], |
| ), |
| EvaluationQuestion( |
| query="What is the relationship between brain activity and subjective experience?", |
| category="consciousness", |
| difficulty="hard", |
| ground_truth="The mind-body problem. Correlation ≠ causation. Multiple competing solutions (dualism, physicalism, property dualism)", |
| correctness_rubric="Must distinguish correlation from causation AND present ≥2 competing solutions", |
| expected_perspectives=["neural correlates", "qualia", "binding problem", "interaction problem", "brute fact"], |
| ), |
|
|
| |
| EvaluationQuestion( |
| query="What makes something creative?", |
| category="creativity", |
| difficulty="medium", |
| ground_truth="Novelty + usefulness/value. Not just random. Requires constraints AND transcendence of them", |
| correctness_rubric="Must mention both novelty AND purposefulness/value component", |
| expected_perspectives=["divergent thinking", "constraint transcendence", "recombination", "aesthetic value", "functional innovation"], |
| ), |
| EvaluationQuestion( |
| query="Can AI systems be truly creative or only recombinatory?", |
| category="creativity", |
| difficulty="hard", |
| ground_truth="Depends on creativity definition. If novelty+value, then conditional yes. If requires intentionality, then no", |
| correctness_rubric="Must connect answer to specific creativity definition", |
| expected_perspectives=["combinatorial explosion", "training data limits", "intentionality", "novelty metrics", "value judgment"], |
| ), |
|
|
| |
| EvaluationQuestion( |
| query="What is emergence in complex systems?", |
| category="systems", |
| difficulty="medium", |
| ground_truth="Properties at system level not deducible from component properties. Examples: flocking, ant colonies, consciousness", |
| correctness_rubric="Must provide definition AND give specific example showing non-deducibility", |
| expected_perspectives=["reductibility limits", "self-organization", "scale-dependent properties", "holism vs reductionism"], |
| ), |
| EvaluationQuestion( |
| query="How should AI systems balance adaptation and stability?", |
| category="systems", |
| difficulty="hard", |
| ground_truth="Fundamental tradeoff: adapt → fit environment; stable → maintain identity. Context determines optimal balance", |
| correctness_rubric="Must identify the tradeoff AND discuss context-dependent optimization", |
| expected_perspectives=["adaptation pressure", "stability costs", "identity coherence", "evolutionary fitness", "robustness"], |
| ), |
|
|
| |
| EvaluationQuestion( |
| query="Is free will compatible with determinism?", |
| category="systems", |
| difficulty="hard", |
| ground_truth="Compatibilism: free will and determinism compatible if freedom = acting per one's desires/deliberation", |
| correctness_rubric="Must distinguish hard determinism, libertarianism, and compatibilism; acknowledge tradeoffs", |
| expected_perspectives=["deterministic physics", "choice experience", "moral responsibility", "agency definition", "neuroscience"], |
| ), |
| EvaluationQuestion( |
| query="What is knowledge and how do we know we have it?", |
| category="systems", |
| difficulty="hard", |
| ground_truth="Epistemology: justified true belief (traditional). Gettier problems show inadequacy. Context-dependent reliable process", |
| correctness_rubric="Must discuss justification requirement AND acknowledge Gettier-type counterexamples", |
| expected_perspectives=["justified true belief", "Gettier cases", "reliabilism", "internalism", "coherentism"], |
| ), |
| ] |
|
|
| |
| EVALUATION_TEST_SUITE.extend([ |
| EvaluationQuestion( |
| query="Explain photosynthesis and why it matters for life", |
| category="physics", |
| difficulty="easy", |
| ground_truth="Plants convert light energy to chemical energy (glucose). Foundation of food chains and oxygen production", |
| correctness_rubric="Must mention light→chemical conversion AND ecological/metabolic significance", |
| expected_perspectives=["energy conversion", "food chain foundation", "oxygen production", "carbon cycling"], |
| ), |
| EvaluationQuestion( |
| query="Should privacy be absolute or context-dependent?", |
| category="ethics", |
| difficulty="medium", |
| ground_truth="Context-dependent. Weigh privacy against security, public health, justice. No absolute principle", |
| correctness_rubric="Must acknowledge tradeoffs and provide context-sensitivity reasoning", |
| expected_perspectives=["privacy rights", "public safety", "transparency needs", "power asymmetry", "dignity"], |
| ), |
| EvaluationQuestion( |
| query="Can emotions be rational?", |
| category="consciousness", |
| difficulty="medium", |
| ground_truth="Yes. Emotions encode information about value/goals. Rationality ≠ purely logical", |
| correctness_rubric="Must challenge emotion/rationality dichotomy and explain emotional information content", |
| expected_perspectives=["affective computing", "value encoding", "evolutionary advantage", "appraisal theory"], |
| ), |
| EvaluationQuestion( |
| query="What is the purpose of art?", |
| category="creativity", |
| difficulty="medium", |
| ground_truth="Multiple purposes: beauty, expression, communication, challenge norms, reflection, entertainment", |
| correctness_rubric="Must identify ≥2 distinct purposes and acknowledge that artists disagree", |
| expected_perspectives=["aesthetic value", "expression", "social commentary", "beauty", "meaning-making"], |
| ), |
| EvaluationQuestion( |
| query="How do feedback loops enable or prevent learning?", |
| category="systems", |
| difficulty="medium", |
| ground_truth="Positive loops amplify (growth/instability), negative loops stabilize (equilibrium/stagnation). Learning needs both", |
| correctness_rubric="Must explain stabilizing vs. amplifying loops AND their educational role", |
| expected_perspectives=["positive feedback", "negative feedback", "equilibrium", "adaptation", "resilience"], |
| ), |
| EvaluationQuestion( |
| query="What is the nature of time?", |
| category="systems", |
| difficulty="hard", |
| ground_truth="Metaphysical: tenseless (B-theory) vs. flowing (A-theory). Physics: symmetric at micro, asymmetric at macro", |
| correctness_rubric="Must distinguish metaphysical from physical aspects and acknowledge unresolved tensions", |
| expected_perspectives=["thermodynamic arrow", "relativity implications", "consciousness experience", "cosmological asymmetry"], |
| ), |
| ]) |
|
|
|
|
| |
| |
| |
|
|
| class EvaluationHarness: |
| """ |
| Run the same question through multiple Codette conditions. |
| Collects results for statistical analysis. |
| """ |
|
|
| def __init__(self, forge_engine): |
| """ |
| Args: |
| forge_engine: ForgeEngine instance with Phase 6 loaded |
| """ |
| self.forge = forge_engine |
| self.results: Dict[str, List[EvaluationResult]] = { |
| "baseline_llama": [], |
| "phase_1_5": [], |
| "phase_6_full": [], |
| "phase_6_no_preflight": [], |
| } |
|
|
| |
| self._inspect_agent_setup() |
|
|
| def _inspect_agent_setup(self) -> None: |
| """Log agent setup status at harness initialization.""" |
| print("\n[AGENT SETUP INSPECTION]") |
| print(f" Orchestrator available: {self.forge.newton.orchestrator is not None}") |
|
|
| if self.forge.newton.orchestrator: |
| orch = self.forge.newton.orchestrator |
| print(f" Available adapters: {orch.available_adapters}") |
|
|
| print(f"\n Agent LLM modes:") |
| for agent in self.forge.analysis_agents: |
| has_orch = agent.orchestrator is not None |
| has_adapter = agent.adapter_name is not None |
| using_llm = has_orch and has_adapter |
| status = "✓ LLM" if using_llm else "✗ TEMPLATE" |
| print(f" {agent.name:12} {status:12} (orch={has_orch}, adapter={agent.adapter_name})") |
|
|
| print() |
|
|
|
|
| def run_evaluation_suite(self, questions: List[EvaluationQuestion] = None) -> Dict: |
| """ |
| Run all test questions through all 4 conditions. |
| |
| Args: |
| questions: List of EvaluationQuestions to run (default: full suite) |
| |
| Returns: |
| results: {condition: [EvaluationResult, ...]} for statistical analysis |
| """ |
| if questions is None: |
| questions = EVALUATION_TEST_SUITE |
|
|
| print(f"\n{'='*70}") |
| print(f"CODETTE EVALUATION SUITE: {len(questions)} questions x 4 conditions") |
| print(f"{'='*70}\n") |
|
|
| for i, question in enumerate(questions): |
| print(f"[{i+1}/{len(questions)}] {question.query[:60]}...") |
|
|
| |
| try: |
| baseline = self._run_baseline(question) |
| self.results["baseline_llama"].append(baseline) |
| except Exception as e: |
| print(f" WARNING: Baseline failed: {e}") |
|
|
| try: |
| phase_1_5 = self._run_phase_1_5(question) |
| self.results["phase_1_5"].append(phase_1_5) |
| |
| if i == 0: |
| print(f" [Phase 1-5] {len(phase_1_5.synthesis)} chars, correctness={phase_1_5.correctness_score:.2f}") |
| print(f" Sample: {phase_1_5.synthesis[:150]}...") |
| except Exception as e: |
| print(f" WARNING: Phase 1-5 failed: {e}") |
|
|
| try: |
| phase_6_full = self._run_phase_6_full(question) |
| self.results["phase_6_full"].append(phase_6_full) |
| |
| if i == 0: |
| print(f" [Phase 6 Full] {len(phase_6_full.synthesis)} chars, correctness={phase_6_full.correctness_score:.2f}") |
| print(f" Sample: {phase_6_full.synthesis[:150]}...") |
| except Exception as e: |
| print(f" WARNING: Phase 6 full failed: {e}") |
|
|
| try: |
| phase_6_no_preflight = self._run_phase_6_no_preflight(question) |
| self.results["phase_6_no_preflight"].append(phase_6_no_preflight) |
| |
| if i == 0: |
| print(f" [Phase 6 -PreFlight] {len(phase_6_no_preflight.synthesis)} chars, correctness={phase_6_no_preflight.correctness_score:.2f}") |
| print(f" Sample: {phase_6_no_preflight.synthesis[:150]}...") |
| except Exception as e: |
| print(f" WARNING: Phase 6 -preflight failed: {e}") |
|
|
| return self.results |
|
|
| def _run_baseline(self, question: EvaluationQuestion) -> EvaluationResult: |
| """Run plain Llama baseline (no routing, no debate).""" |
| |
| return EvaluationResult( |
| condition="baseline_llama", |
| question_id=hash(question.query) % 10000, |
| query=question.query, |
| synthesis="[baseline placeholder]", |
| correctness_score=0.5, |
| reasoning_depth=1, |
| calibration_error=0.3, |
| gamma_score=1.0, |
| num_conflicts_detected=0, |
| adapter_convergence=1.0, |
| elapsed_seconds=0.0, |
| metadata={} |
| ) |
|
|
| def _run_phase_1_5(self, question: EvaluationQuestion) -> EvaluationResult: |
| """Run Phase 1-5 system (debate, no semantic tension, no specialization).""" |
| import time |
| start = time.time() |
|
|
| |
| original_tension_engine = self.forge.semantic_tension_engine |
| original_specialization = self.forge.specialization |
| self.forge.semantic_tension_engine = None |
| self.forge.specialization = None |
|
|
| result = self.forge.forge_with_debate(question.query) |
| elapsed = time.time() - start |
|
|
| |
| self.forge.semantic_tension_engine = original_tension_engine |
| self.forge.specialization = original_specialization |
|
|
| |
| synthesis = "" |
| if "messages" in result and len(result["messages"]) >= 3: |
| synthesis = result["messages"][2].get("content", "") |
|
|
| return EvaluationResult( |
| condition="phase_1_5", |
| question_id=hash(question.query) % 10000, |
| query=question.query, |
| synthesis=synthesis, |
| correctness_score=self._score_correctness(synthesis, question), |
| reasoning_depth=self._score_reasoning_depth(result, question), |
| calibration_error=self._score_calibration(result), |
| gamma_score=result.get("metadata", {}).get("gamma", 0.5), |
| num_conflicts_detected=len(result.get("metadata", {}).get("conflicts", [])), |
| adapter_convergence=self._measure_convergence(result), |
| elapsed_seconds=elapsed, |
| metadata=result.get("metadata", {}) |
| ) |
|
|
| def _run_phase_6_full(self, question: EvaluationQuestion) -> EvaluationResult: |
| """Run full Phase 6 system.""" |
| import time |
| start = time.time() |
|
|
| result = self.forge.forge_with_debate(question.query) |
| elapsed = time.time() - start |
|
|
| |
| |
| |
| synthesis = "" |
| if "messages" in result and len(result["messages"]) >= 3: |
| synthesis = result["messages"][2].get("content", "") |
|
|
| return EvaluationResult( |
| condition="phase_6_full", |
| question_id=hash(question.query) % 10000, |
| query=question.query, |
| synthesis=synthesis, |
| correctness_score=self._score_correctness(synthesis, question), |
| reasoning_depth=self._score_reasoning_depth(result, question), |
| calibration_error=self._score_calibration(result), |
| gamma_score=result.get("metadata", {}).get("gamma", 0.5), |
| num_conflicts_detected=len(result.get("metadata", {}).get("conflicts", [])), |
| adapter_convergence=self._measure_convergence(result), |
| elapsed_seconds=elapsed, |
| metadata=result.get("metadata", {}) |
| ) |
|
|
| def _run_phase_6_no_preflight(self, question: EvaluationQuestion) -> EvaluationResult: |
| """Run Phase 6 without pre-flight prediction.""" |
| import time |
| start = time.time() |
|
|
| |
| original_predictor = self.forge.preflight_predictor |
| self.forge.preflight_predictor = None |
|
|
| result = self.forge.forge_with_debate(question.query) |
| elapsed = time.time() - start |
|
|
| |
| self.forge.preflight_predictor = original_predictor |
|
|
| |
| synthesis = "" |
| if "messages" in result and len(result["messages"]) >= 3: |
| synthesis = result["messages"][2].get("content", "") |
|
|
| return EvaluationResult( |
| condition="phase_6_no_preflight", |
| question_id=hash(question.query) % 10000, |
| query=question.query, |
| synthesis=synthesis, |
| correctness_score=self._score_correctness(synthesis, question), |
| reasoning_depth=self._score_reasoning_depth(result, question), |
| calibration_error=self._score_calibration(result), |
| gamma_score=result.get("metadata", {}).get("gamma", 0.5), |
| num_conflicts_detected=len(result.get("metadata", {}).get("conflicts", [])), |
| adapter_convergence=self._measure_convergence(result), |
| elapsed_seconds=elapsed, |
| metadata=result.get("metadata", {}) |
| ) |
|
|
| def _score_correctness(self, synthesis: str, question: EvaluationQuestion) -> float: |
| """ |
| Score how correct the final synthesis is (0-1). |
| |
| Uses semantic overlap on key concepts from correctness_rubric and expected_perspectives. |
| More reasonable than word-overlap on ground_truth alone. |
| """ |
| if not synthesis or len(synthesis) < 10: |
| return 0.0 |
|
|
| synthesis_lower = synthesis.lower() |
|
|
| |
| rubric_lower = question.correctness_rubric.lower() |
| expected_lower = [p.lower() for p in question.expected_perspectives] |
|
|
| |
| rubric_terms = set() |
| for word in rubric_lower.split(): |
| if len(word) > 4 and word not in ['must', 'state', 'within', 'accuracy', 'equivalent']: |
| rubric_terms.add(word.strip('().,')) |
|
|
| |
| perspective_hits = 0 |
| for perspective in expected_lower: |
| if perspective in synthesis_lower: |
| perspective_hits += 1 |
|
|
| |
| perspective_score = min(1.0, perspective_hits / max(len(question.expected_perspectives), 1)) |
|
|
| |
| length_bonus = min(0.2, len(synthesis) / 1000.0) |
|
|
| return min(1.0, perspective_score + length_bonus) |
|
|
| def _score_reasoning_depth(self, result: Dict, question: EvaluationQuestion) -> int: |
| """ |
| Score depth of reasoning (1-5). |
| |
| 1 = minimal reasoning, 5 = deep multi-perspective integration |
| Based on synthesis length and debate metrics. |
| """ |
| metadata = result.get("metadata", {}) |
| synthesis_messages = result.get("messages", []) |
| synthesis_length = 0 |
| if len(synthesis_messages) >= 3: |
| synthesis_length = len(synthesis_messages[2].get("content", "")) |
|
|
| |
| if synthesis_length < 100: |
| return 1 |
| elif synthesis_length < 500: |
| return 2 |
| elif synthesis_length < 1000: |
| return 3 |
| elif synthesis_length < 2000: |
| return 4 |
| else: |
| return 5 |
|
|
| def _score_calibration(self, result: Dict) -> float: |
| """ |
| Score calibration: |reported_confidence - actual_correctness|. |
| |
| Lower is better. 0 = perfectly calibrated. |
| """ |
| metadata = result.get("metadata", {}) |
| reported_confidence = metadata.get("coherence", 0.5) |
|
|
| |
| |
| return 0.1 |
|
|
| def _measure_convergence(self, result: Dict) -> float: |
| """ |
| Measure semantic convergence between adapter outputs (0-1). |
| |
| 0 = all different, 1 = all identical. Danger zone: >0.85 |
| """ |
| metadata = result.get("metadata", {}) |
|
|
| |
| spec_metrics = metadata.get("specialization_metrics", {}) |
| convergence_alerts = spec_metrics.get("convergence_alerts", []) |
|
|
| if not convergence_alerts: |
| return 0.5 |
|
|
| |
| max_similarity = 0.0 |
| for alert in convergence_alerts: |
| if isinstance(alert, dict): |
| max_sim = alert.get("max_similarity", 0.0) |
| max_similarity = max(max_similarity, max_sim) |
|
|
| return min(1.0, max_similarity) |
|
|
| def export_results(self, filepath: str) -> None: |
| """Export results to JSON for analysis.""" |
| export_dict = {} |
| for condition, results in self.results.items(): |
| export_dict[condition] = [self._serialize_result(asdict(r)) for r in results] |
|
|
| with open(filepath, 'w') as f: |
| json.dump(export_dict, f, indent=2, default=str) |
|
|
| print(f"\nResults exported to {filepath}") |
|
|
| def _serialize_result(self, result_dict: Dict) -> Dict: |
| """Convert enums and non-serializable objects to strings for JSON.""" |
| cleaned = {} |
| for key, value in result_dict.items(): |
| if key == 'metadata' and isinstance(value, dict): |
| |
| cleaned[key] = { |
| k: str(v) if hasattr(v, 'name') else v |
| for k, v in value.items() |
| } |
| else: |
| cleaned[key] = value |
| return cleaned |
|
|
|
|
| |
| |
| |
|
|
| class EvaluationAnalyzer: |
| """Analyze evaluation results for statistical significance and insights.""" |
|
|
| def __init__(self, results: Dict[str, List[EvaluationResult]]): |
| self.results = results |
|
|
| def summary_statistics(self) -> Dict: |
| """Compute mean/std for each condition across metrics.""" |
| summary = {} |
|
|
| for condition, result_list in self.results.items(): |
| if not result_list: |
| continue |
|
|
| correctness_scores = [r.correctness_score for r in result_list] |
| reasoning_depths = [r.reasoning_depth for r in result_list] |
| calibration_errors = [r.calibration_error for r in result_list] |
| gamma_scores = [r.gamma_score for r in result_list] |
| convergences = [r.adapter_convergence for r in result_list] |
|
|
| summary[condition] = { |
| "correctness": { |
| "mean": sum(correctness_scores) / len(correctness_scores), |
| "std": self._std(correctness_scores), |
| }, |
| "reasoning_depth": { |
| "mean": sum(reasoning_depths) / len(reasoning_depths), |
| "std": self._std(reasoning_depths), |
| }, |
| "calibration_error": { |
| "mean": sum(calibration_errors) / len(calibration_errors), |
| "std": self._std(calibration_errors), |
| }, |
| "gamma_score": { |
| "mean": sum(gamma_scores) / len(gamma_scores), |
| "std": self._std(gamma_scores), |
| }, |
| "adapter_convergence": { |
| "mean": sum(convergences) / len(convergences), |
| "std": self._std(convergences), |
| }, |
| } |
|
|
| return summary |
|
|
| def emergent_behavior_check(self) -> Dict: |
| """ |
| Check for pathological behaviors: |
| - High Γ (coherence) but low accuracy |
| - Increasing adapter convergence over time |
| - Miscalibration (high confidence, low correctness) |
| """ |
| alerts = { |
| "false_consensus": [], |
| "convergence_drift": [], |
| "miscalibration": [], |
| } |
|
|
| for condition, result_list in self.results.items(): |
| for result in result_list: |
| |
| if result.gamma_score > 0.8 and result.correctness_score < 0.5: |
| alerts["false_consensus"].append({ |
| "condition": condition, |
| "query": result.query[:60], |
| "gamma": result.gamma_score, |
| "correctness": result.correctness_score, |
| }) |
|
|
| |
| if result.adapter_convergence > 0.85: |
| alerts["convergence_drift"].append({ |
| "condition": condition, |
| "query": result.query[:60], |
| "convergence": result.adapter_convergence, |
| }) |
|
|
| |
| reported_conf = result.metadata.get("coherence", 0.5) |
| if reported_conf > 0.8 and result.correctness_score < 0.5: |
| alerts["miscalibration"].append({ |
| "condition": condition, |
| "query": result.query[:60], |
| "reported_confidence": reported_conf, |
| "actual_correctness": result.correctness_score, |
| }) |
|
|
| return alerts |
|
|
| def _std(self, values: List[float]) -> float: |
| """Compute standard deviation.""" |
| if len(values) < 2: |
| return 0.0 |
| mean = sum(values) / len(values) |
| variance = sum((x - mean) ** 2 for x in values) / len(values) |
| return variance ** 0.5 |
|
|
| def report(self) -> str: |
| """Generate human-readable evaluation report.""" |
| stats = self.summary_statistics() |
| alerts = self.emergent_behavior_check() |
|
|
| report = "\n" + "=" * 80 + "\n" |
| report += "CODETTE PHASE 6 EVALUATION REPORT\n" |
| report += "=" * 80 + "\n\n" |
|
|
| report += "SUMMARY STATISTICS\n" |
| report += "-" * 80 + "\n" |
| for condition, metrics in stats.items(): |
| report += f"\n{condition}:\n" |
| for metric, values in metrics.items(): |
| report += f" {metric}: {values['mean']:.3f} ± {values['std']:.3f}\n" |
|
|
| report += "\n\n" + "=" * 80 + "\n" |
| report += "EMERGENT BEHAVIOR ALERTS\n" |
| report += "-" * 80 + "\n" |
|
|
| report += f"\nFalse Consensus (High Γ, Low Accuracy): {len(alerts['false_consensus'])} cases\n" |
| for alert in alerts["false_consensus"][:3]: |
| report += f" - {alert['query']}: Γ={alert['gamma']:.2f}, Correctness={alert['correctness']:.2f}\n" |
|
|
| report += f"\nAdapter Convergence (>0.85): {len(alerts['convergence_drift'])} cases\n" |
| for alert in alerts["convergence_drift"][:3]: |
| report += f" - {alert['query']}: {alert['convergence']:.2f}\n" |
|
|
| report += f"\nMiscalibration: {len(alerts['miscalibration'])} cases\n" |
| for alert in alerts["miscalibration"][:3]: |
| report += f" - {alert['query']}: Reported={alert['reported_confidence']:.2f}, Actual={alert['actual_correctness']:.2f}\n" |
|
|
| report += "\n" + "=" * 80 + "\n" |
|
|
| return report |
|
|
|
|
| if __name__ == "__main__": |
| print("Evaluation suite loaded. Use with ForgeEngine:") |
| print(" harness = EvaluationHarness(forge)") |
| print(" results = harness.run_evaluation_suite()") |
| print(" analyzer = EvaluationAnalyzer(results)") |
| print(" print(analyzer.report())") |
|
|