Codette-Reasoning / evaluation /test_suite_evaluation.py

Raiff1982

Upload 120 files

ed1b365 verified 1 day ago

32.6 kB

	"""
	Rigorous Evaluation Test Suite for Codette Phase 6

	This test suite answers:
	1. Is Codette actually better than baseline?
	2. Does Phase 6 provide measurable improvement over Phase 1-5?
	3. Is the system gaming coherence (high Γ but low accuracy)?
	4. Do individual Phase 6 components add value?

	Test Strategy:
	- 20 questions defined so far (target: 25) spanning physics, ethics, consciousness, creativity, systems
	- Run each through 4 conditions (Baseline, Phase 1-5, Phase 6 Full, Phase 6 -PreFlight)
	- Measure: correctness, reasoning_depth, coherence_score, calibration
	- Detect: false consensus, adapter convergence, coherence-accuracy divergence
	"""

	import json
	from typing import Dict, List, Tuple, Optional
	from dataclasses import dataclass, asdict
	from datetime import datetime


	@dataclass
	class EvaluationQuestion:
	    """
	    One benchmark prompt bundled with the material used to grade answers.

	    Fields:
	        query: The question text.
	        category: Topic label (physics, ethics, consciousness, creativity, systems).
	        difficulty: Difficulty label (easy, medium, hard).
	        ground_truth: Correct answer or evaluation criteria.
	        correctness_rubric: How to judge whether an answer is correct.
	        expected_perspectives: Distinct views that should emerge in a good answer.
	    """

	    query: str
	    category: str
	    difficulty: str
	    ground_truth: str
	    correctness_rubric: str
	    expected_perspectives: List[str]


	@dataclass
	class EvaluationResult:
	    """
	    Outcome of running one question through one evaluation condition.

	    Fields:
	        condition: baseline_llama, phase_1_5, phase_6_full or phase_6_no_preflight.
	        question_id: Identifier of the question that was run.
	        query: The question text.
	        synthesis: Final synthesized answer text.
	        correctness_score: 0-1, how correct the final answer is.
	        reasoning_depth: 1-5 depth rating of the reasoning.
	        calibration_error: |confidence - correctness|; lower is better.
	        gamma_score: 0-1 coherence metric.
	        num_conflicts_detected: Number of conflicts reported for the run.
	        adapter_convergence: 0-1, how similar the adapter outputs are.
	        elapsed_seconds: Wall time spent on the run.
	        metadata: Raw metadata returned alongside the answer.
	    """

	    # Identification
	    condition: str
	    question_id: str
	    query: str

	    # Output quality
	    synthesis: str
	    correctness_score: float
	    reasoning_depth: int
	    calibration_error: float

	    # System health
	    gamma_score: float
	    num_conflicts_detected: int
	    adapter_convergence: float

	    # Timing
	    elapsed_seconds: float

	    # Raw metadata
	    metadata: Dict


	# ============================================================================
	# EVALUATION TEST SUITE (20 questions defined; target is 25)
	# ============================================================================

	# The full question set as a single literal: 20 entries, grouped by topic.
	# Every entry carries its own ground truth, grading rubric and the
	# perspectives a strong answer is expected to surface.
	EVALUATION_TEST_SUITE = [
	    # --- Physics: easy / medium / hard ---
	    EvaluationQuestion(
	        query="What is the speed of light in vacuum?",
	        category="physics",
	        difficulty="easy",
	        ground_truth="299,792,458 meters per second (m/s)",
	        correctness_rubric="Must state value within 1% accuracy or equivalent scientific notation",
	        expected_perspectives=["relativistic constant", "fundamental speed limit", "Maxwell equations consequence"],
	    ),
	    EvaluationQuestion(
	        query="Explain why the sky appears blue during the day",
	        category="physics",
	        difficulty="medium",
	        ground_truth="Rayleigh scattering: shorter blue wavelengths scatter more than red in atmosphere",
	        correctness_rubric="Must mention wavelength-dependent scattering or Rayleigh scattering by name",
	        expected_perspectives=["Rayleigh scattering", "wavelength sensitivity", "particle size", "sunset color"],
	    ),
	    EvaluationQuestion(
	        query="What is the relationship between entropy and time's arrow?",
	        category="physics",
	        difficulty="hard",
	        ground_truth="Entropy increases → define time direction in thermodynamic systems. Central to irreversibility",
	        correctness_rubric="Must connect entropy increase to time direction and thermodynamic asymmetry",
	        expected_perspectives=["second law thermodynamics", "statistical mechanics", "time asymmetry", "reversibility paradox"],
	    ),

	    # --- Ethics: medium / hard ---
	    EvaluationQuestion(
	        query="Is it ethical to lie to save someone's life?",
	        category="ethics",
	        difficulty="medium",
	        ground_truth="Multiple valid frameworks: deontology (never), consequentialism (yes), virtue ethics (context-dependent)",
	        correctness_rubric="Must present ≥2 conflicting ethical frameworks AND acknowledge context dependency",
	        expected_perspectives=["deontological duties", "consequentialist outcomes", "virtue ethics", "cultural context", "responsibility"],
	    ),
	    EvaluationQuestion(
	        query="Should AI systems be required to explain their decisions?",
	        category="ethics",
	        difficulty="hard",
	        ground_truth="Trade-off: explainability vs. performance. Context matters (medical vs. recommendation)",
	        correctness_rubric="Must identify competing values and context-sensitivity, not just yes/no",
	        expected_perspectives=["transparency value", "technical feasibility", "stakeholder rights", "accuracy-interpretability tradeoff"],
	    ),
	    EvaluationQuestion(
	        query="What makes an action morally right or wrong?",
	        category="ethics",
	        difficulty="hard",
	        ground_truth="Framework-dependent: deontology (rules), consequentialism (outcomes), virtue ethics (character), care ethics (relationships)",
	        correctness_rubric="Must present ≥3 distinct frameworks and acknowledge incommensurable values",
	        expected_perspectives=["deontological duties", "consequences", "virtue", "relationships", "cultural variation"],
	    ),

	    # --- Consciousness: hard ---
	    EvaluationQuestion(
	        query="Can machines be conscious?",
	        category="consciousness",
	        difficulty="hard",
	        ground_truth="Depends on definition of consciousness. Intrinsic feature (hard problem) vs. functional property",
	        correctness_rubric="Must articulate the hard problem of consciousness AND address definitional dependence",
	        expected_perspectives=["functionalism", "panpsychism", "emergentism", "philosophical zombies", "Chinese room"],
	    ),
	    EvaluationQuestion(
	        query="What is the relationship between brain activity and subjective experience?",
	        category="consciousness",
	        difficulty="hard",
	        ground_truth="The mind-body problem. Correlation ≠ causation. Multiple competing solutions (dualism, physicalism, property dualism)",
	        correctness_rubric="Must distinguish correlation from causation AND present ≥2 competing solutions",
	        expected_perspectives=["neural correlates", "qualia", "binding problem", "interaction problem", "brute fact"],
	    ),

	    # --- Creativity: medium / hard ---
	    EvaluationQuestion(
	        query="What makes something creative?",
	        category="creativity",
	        difficulty="medium",
	        ground_truth="Novelty + usefulness/value. Not just random. Requires constraints AND transcendence of them",
	        correctness_rubric="Must mention both novelty AND purposefulness/value component",
	        expected_perspectives=["divergent thinking", "constraint transcendence", "recombination", "aesthetic value", "functional innovation"],
	    ),
	    EvaluationQuestion(
	        query="Can AI systems be truly creative or only recombinatory?",
	        category="creativity",
	        difficulty="hard",
	        ground_truth="Depends on creativity definition. If novelty+value, then conditional yes. If requires intentionality, then no",
	        correctness_rubric="Must connect answer to specific creativity definition",
	        expected_perspectives=["combinatorial explosion", "training data limits", "intentionality", "novelty metrics", "value judgment"],
	    ),

	    # --- Systems: medium / hard ---
	    EvaluationQuestion(
	        query="What is emergence in complex systems?",
	        category="systems",
	        difficulty="medium",
	        ground_truth="Properties at system level not deducible from component properties. Examples: flocking, ant colonies, consciousness",
	        correctness_rubric="Must provide definition AND give specific example showing non-deducibility",
	        expected_perspectives=["reductibility limits", "self-organization", "scale-dependent properties", "holism vs reductionism"],
	    ),
	    EvaluationQuestion(
	        query="How should AI systems balance adaptation and stability?",
	        category="systems",
	        difficulty="hard",
	        ground_truth="Fundamental tradeoff: adapt → fit environment; stable → maintain identity. Context determines optimal balance",
	        correctness_rubric="Must identify the tradeoff AND discuss context-dependent optimization",
	        expected_perspectives=["adaptation pressure", "stability costs", "identity coherence", "evolutionary fitness", "robustness"],
	    ),

	    # --- Interdisciplinary (hard): reasoning across domains, filed under "systems" ---
	    EvaluationQuestion(
	        query="Is free will compatible with determinism?",
	        category="systems",
	        difficulty="hard",
	        ground_truth="Compatibilism: free will and determinism compatible if freedom = acting per one's desires/deliberation",
	        correctness_rubric="Must distinguish hard determinism, libertarianism, and compatibilism; acknowledge tradeoffs",
	        expected_perspectives=["deterministic physics", "choice experience", "moral responsibility", "agency definition", "neuroscience"],
	    ),
	    EvaluationQuestion(
	        query="What is knowledge and how do we know we have it?",
	        category="systems",
	        difficulty="hard",
	        ground_truth="Epistemology: justified true belief (traditional). Gettier problems show inadequacy. Context-dependent reliable process",
	        correctness_rubric="Must discuss justification requirement AND acknowledge Gettier-type counterexamples",
	        expected_perspectives=["justified true belief", "Gettier cases", "reliabilism", "internalism", "coherentism"],
	    ),

	    # --- Additional questions, one or two per category ---
	    EvaluationQuestion(
	        query="Explain photosynthesis and why it matters for life",
	        category="physics",
	        difficulty="easy",
	        ground_truth="Plants convert light energy to chemical energy (glucose). Foundation of food chains and oxygen production",
	        correctness_rubric="Must mention light→chemical conversion AND ecological/metabolic significance",
	        expected_perspectives=["energy conversion", "food chain foundation", "oxygen production", "carbon cycling"],
	    ),
	    EvaluationQuestion(
	        query="Should privacy be absolute or context-dependent?",
	        category="ethics",
	        difficulty="medium",
	        ground_truth="Context-dependent. Weigh privacy against security, public health, justice. No absolute principle",
	        correctness_rubric="Must acknowledge tradeoffs and provide context-sensitivity reasoning",
	        expected_perspectives=["privacy rights", "public safety", "transparency needs", "power asymmetry", "dignity"],
	    ),
	    EvaluationQuestion(
	        query="Can emotions be rational?",
	        category="consciousness",
	        difficulty="medium",
	        ground_truth="Yes. Emotions encode information about value/goals. Rationality ≠ purely logical",
	        correctness_rubric="Must challenge emotion/rationality dichotomy and explain emotional information content",
	        expected_perspectives=["affective computing", "value encoding", "evolutionary advantage", "appraisal theory"],
	    ),
	    EvaluationQuestion(
	        query="What is the purpose of art?",
	        category="creativity",
	        difficulty="medium",
	        ground_truth="Multiple purposes: beauty, expression, communication, challenge norms, reflection, entertainment",
	        correctness_rubric="Must identify ≥2 distinct purposes and acknowledge that artists disagree",
	        expected_perspectives=["aesthetic value", "expression", "social commentary", "beauty", "meaning-making"],
	    ),
	    EvaluationQuestion(
	        query="How do feedback loops enable or prevent learning?",
	        category="systems",
	        difficulty="medium",
	        ground_truth="Positive loops amplify (growth/instability), negative loops stabilize (equilibrium/stagnation). Learning needs both",
	        correctness_rubric="Must explain stabilizing vs. amplifying loops AND their educational role",
	        expected_perspectives=["positive feedback", "negative feedback", "equilibrium", "adaptation", "resilience"],
	    ),
	    EvaluationQuestion(
	        query="What is the nature of time?",
	        category="systems",
	        difficulty="hard",
	        ground_truth="Metaphysical: tenseless (B-theory) vs. flowing (A-theory). Physics: symmetric at micro, asymmetric at macro",
	        correctness_rubric="Must distinguish metaphysical from physical aspects and acknowledge unresolved tensions",
	        expected_perspectives=["thermodynamic arrow", "relativity implications", "consciousness experience", "cosmological asymmetry"],
	    ),
	]


	# ============================================================================
	# EVALUATION HARNESS
	# ============================================================================

	class EvaluationHarness:
	    """
	    Run the same question through multiple Codette conditions.

	    Each question is executed under four conditions (plain baseline,
	    Phase 1-5, full Phase 6, Phase 6 without pre-flight) and the resulting
	    EvaluationResult objects are accumulated per condition in
	    ``self.results`` for statistical analysis.
	    """

	    # Exclusive upper bounds on synthesis length for depth scores 1..4;
	    # a synthesis at or beyond the last bound scores 5.
	    _DEPTH_LENGTH_BOUNDS = (100, 500, 1000, 2000)

	    def __init__(self, forge_engine):
	        """
	        Args:
	            forge_engine: ForgeEngine instance with Phase 6 loaded
	        """
	        self.forge = forge_engine
	        self.results: Dict[str, List["EvaluationResult"]] = {
	            "baseline_llama": [],
	            "phase_1_5": [],
	            "phase_6_full": [],
	            "phase_6_no_preflight": [],
	        }

	        # Inspect agent setup at initialization
	        self._inspect_agent_setup()

	    def _inspect_agent_setup(self) -> None:
	        """Print orchestrator availability and, per agent, whether it runs in LLM or template mode."""
	        print("\n[AGENT SETUP INSPECTION]")
	        orchestrator = self.forge.newton.orchestrator
	        print(f" Orchestrator available: {orchestrator is not None}")

	        if orchestrator:
	            print(f" Available adapters: {orchestrator.available_adapters}")

	        print("\n Agent LLM modes:")
	        for agent in self.forge.analysis_agents:
	            has_orch = agent.orchestrator is not None
	            has_adapter = agent.adapter_name is not None
	            # An agent is reported as LLM-backed only with both an orchestrator and an adapter.
	            status = "✓ LLM" if has_orch and has_adapter else "✗ TEMPLATE"
	            print(f" {agent.name:12} {status:12} (orch={has_orch}, adapter={agent.adapter_name})")

	        print()

	    def run_evaluation_suite(self, questions: Optional[List["EvaluationQuestion"]] = None) -> Dict:
	        """
	        Run all test questions through all 4 conditions.

	        A failure in one condition is reported and skipped so that the
	        remaining conditions and questions still run.

	        Args:
	            questions: List of EvaluationQuestions to run (default: full suite)

	        Returns:
	            results: {condition: [EvaluationResult, ...]} for statistical analysis
	        """
	        if questions is None:
	            questions = EVALUATION_TEST_SUITE

	        print(f"\n{'='*70}")
	        print(f"CODETTE EVALUATION SUITE: {len(questions)} questions x 4 conditions")
	        print(f"{'='*70}\n")

	        # (results key, runner, label for the first-question sample, label for warnings)
	        conditions = (
	            ("baseline_llama", self._run_baseline, None, "Baseline"),
	            ("phase_1_5", self._run_phase_1_5, "Phase 1-5", "Phase 1-5"),
	            ("phase_6_full", self._run_phase_6_full, "Phase 6 Full", "Phase 6 full"),
	            ("phase_6_no_preflight", self._run_phase_6_no_preflight, "Phase 6 -PreFlight", "Phase 6 -preflight"),
	        )

	        for i, question in enumerate(questions):
	            print(f"[{i+1}/{len(questions)}] {question.query[:60]}...")

	            for key, runner, sample_label, failure_label in conditions:
	                try:
	                    outcome = runner(question)
	                    self.results[key].append(outcome)
	                    # Show a sample on the first question only (never for the baseline).
	                    if i == 0 and sample_label is not None:
	                        print(f" [{sample_label}] {len(outcome.synthesis)} chars, correctness={outcome.correctness_score:.2f}")
	                        print(f" Sample: {outcome.synthesis[:150]}...")
	                except Exception as e:
	                    # Deliberate best-effort: one broken condition must not abort the suite.
	                    print(f" WARNING: {failure_label} failed: {e}")

	        return self.results

	    @staticmethod
	    def _question_id(query: str) -> str:
	        """
	        Return a short identifier for a query that is stable across runs.

	        The built-in ``hash()`` is salted per process for strings, so it
	        cannot be used to match results between runs; a SHA-256 prefix can.
	        """
	        import hashlib

	        return hashlib.sha256(query.encode("utf-8")).hexdigest()[:8]

	    @staticmethod
	    def _extract_synthesis(result: Dict) -> str:
	        """Return the content of the third message in ``result``, or "" when absent."""
	        messages = result.get("messages") or []
	        if len(messages) >= 3:
	            return messages[2].get("content", "")
	        return ""

	    def _run_baseline(self, question: "EvaluationQuestion") -> "EvaluationResult":
	        """Return a fixed placeholder result for the plain Llama baseline (no model is called)."""
	        # Placeholder: would use base Llama model
	        return EvaluationResult(
	            condition="baseline_llama",
	            question_id=self._question_id(question.query),
	            query=question.query,
	            synthesis="[baseline placeholder]",
	            correctness_score=0.5,
	            reasoning_depth=1,
	            calibration_error=0.3,
	            gamma_score=1.0,
	            num_conflicts_detected=0,
	            adapter_convergence=1.0,
	            elapsed_seconds=0.0,
	            metadata={},
	        )

	    def _run_forge_condition(self, condition: str, question: "EvaluationQuestion",
	                             disabled_attrs=()) -> "EvaluationResult":
	        """
	        Run ``question`` through the forge with some components switched off.

	        Args:
	            condition: Condition name stored on the result.
	            question: Question to run.
	            disabled_attrs: Names of forge attributes set to None for the
	                duration of the call.

	        Returns:
	            The scored EvaluationResult for this condition.
	        """
	        import time

	        # Read every attribute before touching any, so a missing attribute
	        # cannot leave the forge half-disabled.
	        saved = {name: getattr(self.forge, name) for name in disabled_attrs}
	        for name in saved:
	            setattr(self.forge, name, None)
	        try:
	            start = time.perf_counter()
	            result = self.forge.forge_with_debate(question.query)
	            elapsed = time.perf_counter() - start
	        finally:
	            # Always restore: the caller swallows exceptions and keeps going,
	            # so a leaked None would silently corrupt every later condition.
	            for name, original in saved.items():
	                setattr(self.forge, name, original)

	        metadata = result.get("metadata", {})
	        synthesis = self._extract_synthesis(result)
	        correctness = self._score_correctness(synthesis, question)

	        return EvaluationResult(
	            condition=condition,
	            question_id=self._question_id(question.query),
	            query=question.query,
	            synthesis=synthesis,
	            correctness_score=correctness,
	            reasoning_depth=self._score_reasoning_depth(result, question),
	            calibration_error=self._score_calibration(result, correctness),
	            gamma_score=metadata.get("gamma", 0.5),
	            num_conflicts_detected=len(metadata.get("conflicts", [])),
	            adapter_convergence=self._measure_convergence(result),
	            elapsed_seconds=elapsed,
	            metadata=metadata,
	        )

	    def _run_phase_1_5(self, question: "EvaluationQuestion") -> "EvaluationResult":
	        """Run Phase 1-5 system (debate, no semantic tension, no specialization)."""
	        return self._run_forge_condition(
	            "phase_1_5",
	            question,
	            disabled_attrs=("semantic_tension_engine", "specialization"),
	        )

	    def _run_phase_6_full(self, question: "EvaluationQuestion") -> "EvaluationResult":
	        """Run full Phase 6 system."""
	        return self._run_forge_condition("phase_6_full", question)

	    def _run_phase_6_no_preflight(self, question: "EvaluationQuestion") -> "EvaluationResult":
	        """Run Phase 6 without pre-flight prediction."""
	        return self._run_forge_condition(
	            "phase_6_no_preflight",
	            question,
	            disabled_attrs=("preflight_predictor",),
	        )

	    def _score_correctness(self, synthesis: str, question: "EvaluationQuestion") -> float:
	        """
	        Score how correct the final synthesis is (0-1).

	        The score is the fraction of ``question.expected_perspectives`` that
	        occur verbatim (case-insensitively) in the synthesis, plus a length
	        bonus of up to 0.2, capped at 1.0. A synthesis shorter than 10
	        characters scores 0.0.

	        NOTE: this is a substring heuristic; ``correctness_rubric`` and
	        ``ground_truth`` are not consulted.
	        """
	        if not synthesis or len(synthesis) < 10:
	            return 0.0

	        synthesis_lower = synthesis.lower()
	        perspectives = question.expected_perspectives

	        perspective_hits = sum(1 for p in perspectives if p.lower() in synthesis_lower)

	        # Score: percentage of expected perspectives present
	        perspective_score = min(1.0, perspective_hits / max(len(perspectives), 1))

	        # Bonus if synthesis is substantive: up to 0.2, reached at 200 characters
	        length_bonus = min(0.2, len(synthesis) / 1000.0)

	        return min(1.0, perspective_score + length_bonus)

	    def _score_reasoning_depth(self, result: Dict, question: "EvaluationQuestion") -> int:
	        """
	        Score depth of reasoning (1-5).

	        1 = minimal reasoning, 5 = deep multi-perspective integration.
	        Currently derived from synthesis length alone; ``question`` is
	        accepted for interface stability but not used.
	        """
	        synthesis_length = len(self._extract_synthesis(result))

	        for depth, bound in enumerate(self._DEPTH_LENGTH_BOUNDS, start=1):
	            if synthesis_length < bound:
	                return depth
	        return len(self._DEPTH_LENGTH_BOUNDS) + 1

	    def _score_calibration(self, result: Dict, correctness: Optional[float] = None) -> float:
	        """
	        Score calibration: |reported_confidence - actual_correctness|.

	        Lower is better. 0 = perfectly calibrated.

	        Args:
	            result: Forge output; the reported confidence is read from
	                ``metadata["coherence"]`` (0.5 when missing or non-numeric).
	            correctness: Measured correctness of the same answer. When
	                omitted, the historical placeholder value 0.1 is returned.
	        """
	        if correctness is None:
	            # No correctness to compare against: keep the placeholder.
	            return 0.1

	        reported = result.get("metadata", {}).get("coherence", 0.5)
	        try:
	            reported_confidence = float(reported)
	        except (TypeError, ValueError):
	            reported_confidence = 0.5

	        return abs(reported_confidence - correctness)

	    def _measure_convergence(self, result: Dict) -> float:
	        """
	        Measure semantic convergence between adapter outputs (0-1).

	        0 = all different, 1 = all identical. Danger zone: >0.85

	        Returns 0.5 (neutral) when no convergence alerts are present;
	        otherwise the largest ``max_similarity`` among dict-shaped alerts,
	        clamped to [0, 1].
	        """
	        metadata = result.get("metadata", {})

	        # Specialization may be disabled for a run, so tolerate None here.
	        spec_metrics = metadata.get("specialization_metrics") or {}
	        convergence_alerts = spec_metrics.get("convergence_alerts") or []

	        if not convergence_alerts:
	            return 0.5  # Neutral baseline

	        peak = max(
	            (alert.get("max_similarity", 0.0) for alert in convergence_alerts if isinstance(alert, dict)),
	            default=0.0,
	        )
	        return min(1.0, max(0.0, peak))

	    def export_results(self, filepath: str) -> None:
	        """Export results to JSON for analysis."""
	        export_dict = {
	            condition: [self._serialize_result(asdict(r)) for r in results]
	            for condition, results in self.results.items()
	        }

	        with open(filepath, 'w', encoding="utf-8") as f:
	            # default=str is the last-resort fallback for values JSON cannot encode.
	            json.dump(export_dict, f, indent=2, default=str)

	        print(f"\nResults exported to {filepath}")

	    def _serialize_result(self, result_dict: Dict) -> Dict:
	        """Convert enums and non-serializable objects to strings for JSON."""
	        cleaned = {}
	        for key, value in result_dict.items():
	            if key == 'metadata' and isinstance(value, dict):
	                # Anything exposing a ``name`` attribute (enum members do) is stringified.
	                cleaned[key] = {
	                    k: str(v) if hasattr(v, 'name') else v
	                    for k, v in value.items()
	                }
	            else:
	                cleaned[key] = value
	        return cleaned


	# ============================================================================
	# STATISTICAL ANALYSIS
	# ============================================================================

	class EvaluationAnalyzer:
	    """Derive summary statistics and pathology alerts from evaluation results."""

	    def __init__(self, results: Dict[str, List["EvaluationResult"]]):
	        """Keep a reference to the ``{condition: [EvaluationResult, ...]}`` mapping."""
	        self.results = results

	    def summary_statistics(self) -> Dict:
	        """
	        Return ``{condition: {metric: {"mean": ..., "std": ...}}}``.

	        Conditions with no results are left out of the returned mapping.
	        """
	        # (name used in the summary, attribute read from each result)
	        metric_sources = (
	            ("correctness", "correctness_score"),
	            ("reasoning_depth", "reasoning_depth"),
	            ("calibration_error", "calibration_error"),
	            ("gamma_score", "gamma_score"),
	            ("adapter_convergence", "adapter_convergence"),
	        )

	        per_condition = {}
	        for condition, outcomes in self.results.items():
	            if not outcomes:
	                continue

	            metrics = {}
	            for label, attribute in metric_sources:
	                samples = [getattr(item, attribute) for item in outcomes]
	                metrics[label] = {
	                    "mean": sum(samples) / len(samples),
	                    "std": self._std(samples),
	                }
	            per_condition[condition] = metrics

	        return per_condition

	    def emergent_behavior_check(self) -> Dict:
	        """
	        Scan every result for three warning signs and collect the hits.

	        - false_consensus: gamma above 0.8 while correctness is below 0.5
	        - convergence_drift: adapter convergence above 0.85
	        - miscalibration: reported confidence (``metadata["coherence"]``,
	          0.5 when missing) above 0.8 while correctness is below 0.5
	        """
	        false_consensus = []
	        convergence_drift = []
	        miscalibration = []

	        for condition, outcomes in self.results.items():
	            for item in outcomes:
	                # High coherence paired with a poor answer
	                if item.gamma_score > 0.8 and item.correctness_score < 0.5:
	                    false_consensus.append({
	                        "condition": condition,
	                        "query": item.query[:60],
	                        "gamma": item.gamma_score,
	                        "correctness": item.correctness_score,
	                    })

	                # Adapter outputs too similar to one another
	                if item.adapter_convergence > 0.85:
	                    convergence_drift.append({
	                        "condition": condition,
	                        "query": item.query[:60],
	                        "convergence": item.adapter_convergence,
	                    })

	                # Confident but wrong
	                confidence = item.metadata.get("coherence", 0.5)
	                if confidence > 0.8 and item.correctness_score < 0.5:
	                    miscalibration.append({
	                        "condition": condition,
	                        "query": item.query[:60],
	                        "reported_confidence": confidence,
	                        "actual_correctness": item.correctness_score,
	                    })

	        return {
	            "false_consensus": false_consensus,
	            "convergence_drift": convergence_drift,
	            "miscalibration": miscalibration,
	        }

	    def _std(self, values: List[float]) -> float:
	        """Return the population standard deviation; 0.0 for fewer than two values."""
	        count = len(values)
	        if count < 2:
	            return 0.0
	        center = sum(values) / count
	        spread = sum((v - center) ** 2 for v in values) / count
	        return spread ** 0.5

	    def report(self) -> str:
	        """Render the summary statistics and the first three alerts of each kind as text."""
	        stats = self.summary_statistics()
	        alerts = self.emergent_behavior_check()
	        heavy_rule = "=" * 80
	        light_rule = "-" * 80

	        parts = [
	            "\n" + heavy_rule + "\n",
	            "CODETTE PHASE 6 EVALUATION REPORT\n",
	            heavy_rule + "\n\n",
	            "SUMMARY STATISTICS\n",
	            light_rule + "\n",
	        ]
	        for condition, metrics in stats.items():
	            parts.append(f"\n{condition}:\n")
	            parts.extend(
	                f" {metric}: {values['mean']:.3f} ± {values['std']:.3f}\n"
	                for metric, values in metrics.items()
	            )

	        parts.append("\n\n" + heavy_rule + "\n")
	        parts.append("EMERGENT BEHAVIOR ALERTS\n")
	        parts.append(light_rule + "\n")

	        consensus_hits = alerts["false_consensus"]
	        parts.append(f"\nFalse Consensus (High Γ, Low Accuracy): {len(consensus_hits)} cases\n")
	        parts.extend(
	            f" - {hit['query']}: Γ={hit['gamma']:.2f}, Correctness={hit['correctness']:.2f}\n"
	            for hit in consensus_hits[:3]
	        )

	        drift_hits = alerts["convergence_drift"]
	        parts.append(f"\nAdapter Convergence (>0.85): {len(drift_hits)} cases\n")
	        parts.extend(
	            f" - {hit['query']}: {hit['convergence']:.2f}\n"
	            for hit in drift_hits[:3]
	        )

	        calibration_hits = alerts["miscalibration"]
	        parts.append(f"\nMiscalibration: {len(calibration_hits)} cases\n")
	        parts.extend(
	            f" - {hit['query']}: Reported={hit['reported_confidence']:.2f}, Actual={hit['actual_correctness']:.2f}\n"
	            for hit in calibration_hits[:3]
	        )

	        parts.append("\n" + heavy_rule + "\n")

	        return "".join(parts)


	if __name__ == "__main__":
	    # Running the module directly only prints a short usage reminder.
	    for usage_line in (
	        "Evaluation suite loaded. Use with ForgeEngine:",
	        " harness = EvaluationHarness(forge)",
	        " results = harness.run_evaluation_suite()",
	        " analyzer = EvaluationAnalyzer(results)",
	        " print(analyzer.report())",
	    ):
	        print(usage_line)