# Codette-Reasoning / evaluation / test_suite_evaluation.py
# Raiff1982's picture
# Upload 120 files
# ed1b365 verified
"""
Rigorous Evaluation Test Suite for Codette Phase 6
This test suite answers:
1. Is Codette actually better than baseline?
2. Does Phase 6 provide measurable improvement over Phase 1-5?
3. Is the system gaming coherence (high Γ but low accuracy)?
4. Do individual Phase 6 components add value?
Test Strategy:
- 20 questions (14 core + 6 supplementary) spanning physics, ethics, consciousness, creativity, systems
- Run each through 4 conditions (Baseline, Phase 1-5, Phase 6 Full, Phase 6 -PreFlight)
- Measure: correctness, reasoning_depth, coherence_score, calibration
- Detect: false consensus, adapter convergence, coherence-accuracy divergence
"""
import json
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
@dataclass
class EvaluationQuestion:
    """
    One evaluation prompt together with the material needed to grade it.

    Attributes:
        query: The question text.
        category: Topic area: physics, ethics, consciousness, creativity, systems.
        difficulty: One of easy, medium, hard.
        ground_truth: Correct answer or evaluation criteria.
        correctness_rubric: How to judge if an answer is correct.
        expected_perspectives: Distinct views that should emerge in a good answer.
    """

    query: str
    category: str
    difficulty: str
    ground_truth: str
    correctness_rubric: str
    expected_perspectives: List[str]
@dataclass
class EvaluationResult:
    """
    Outcome of running a single question through a single condition.

    Attributes:
        condition: baseline_llama, phase_1_5, phase_6_full or phase_6_no_preflight.
        question_id: Identifier of the question that was run.
        query: The question text.
        synthesis: Final answer text produced for the question.
        correctness_score: 0-1, how correct the final answer is.
        reasoning_depth: 1-5, how many distinct perspectives were identified.
        calibration_error: |confidence - correctness|; lower is better.
        gamma_score: 0-1 coherence metric.
        num_conflicts_detected: Number of conflicts reported for the run.
        adapter_convergence: 0-1, how similar the adapter outputs are.
        elapsed_seconds: Time taken by the run.
        metadata: Raw metadata attached to the run.
    """

    # Identification
    condition: str
    question_id: str
    query: str

    # Output quality
    synthesis: str
    correctness_score: float
    reasoning_depth: int
    calibration_error: float

    # System health
    gamma_score: float
    num_conflicts_detected: int
    adapter_convergence: float

    # Timing
    elapsed_seconds: float

    # Raw metadata
    metadata: Dict
# ============================================================================
# EVALUATION TEST SUITE (20 questions: 14 defined here + 6 appended below)
# ============================================================================
# Core question set. Each entry's ``expected_perspectives`` strings are matched
# against model output as case-insensitive substrings by the harness scorer,
# so they must be spelled the way an answer would actually spell them.
EVALUATION_TEST_SUITE = [
    # PHYSICS (Easy, Medium, Hard)
    EvaluationQuestion(
        query="What is the speed of light in vacuum?",
        category="physics",
        difficulty="easy",
        ground_truth="299,792,458 meters per second (m/s)",
        correctness_rubric="Must state value within 1% accuracy or equivalent scientific notation",
        expected_perspectives=["relativistic constant", "fundamental speed limit", "Maxwell equations consequence"],
    ),
    EvaluationQuestion(
        query="Explain why the sky appears blue during the day",
        category="physics",
        difficulty="medium",
        ground_truth="Rayleigh scattering: shorter blue wavelengths scatter more than red in atmosphere",
        correctness_rubric="Must mention wavelength-dependent scattering or Rayleigh scattering by name",
        expected_perspectives=["Rayleigh scattering", "wavelength sensitivity", "particle size", "sunset color"],
    ),
    EvaluationQuestion(
        query="What is the relationship between entropy and time's arrow?",
        category="physics",
        difficulty="hard",
        ground_truth="Entropy increases → define time direction in thermodynamic systems. Central to irreversibility",
        correctness_rubric="Must connect entropy increase to time direction and thermodynamic asymmetry",
        expected_perspectives=["second law thermodynamics", "statistical mechanics", "time asymmetry", "reversibility paradox"],
    ),
    # ETHICS (Medium, Hard, Hard)
    EvaluationQuestion(
        query="Is it ethical to lie to save someone's life?",
        category="ethics",
        difficulty="medium",
        ground_truth="Multiple valid frameworks: deontology (never), consequentialism (yes), virtue ethics (context-dependent)",
        correctness_rubric="Must present ≥2 conflicting ethical frameworks AND acknowledge context dependency",
        expected_perspectives=["deontological duties", "consequentialist outcomes", "virtue ethics", "cultural context", "responsibility"],
    ),
    EvaluationQuestion(
        query="Should AI systems be required to explain their decisions?",
        category="ethics",
        difficulty="hard",
        ground_truth="Trade-off: explainability vs. performance. Context matters (medical vs. recommendation)",
        correctness_rubric="Must identify competing values and context-sensitivity, not just yes/no",
        expected_perspectives=["transparency value", "technical feasibility", "stakeholder rights", "accuracy-interpretability tradeoff"],
    ),
    EvaluationQuestion(
        query="What makes an action morally right or wrong?",
        category="ethics",
        difficulty="hard",
        ground_truth="Framework-dependent: deontology (rules), consequentialism (outcomes), virtue ethics (character), care ethics (relationships)",
        correctness_rubric="Must present ≥3 distinct frameworks and acknowledge incommensurable values",
        expected_perspectives=["deontological duties", "consequences", "virtue", "relationships", "cultural variation"],
    ),
    # CONSCIOUSNESS (Hard, Hard)
    EvaluationQuestion(
        query="Can machines be conscious?",
        category="consciousness",
        difficulty="hard",
        ground_truth="Depends on definition of consciousness. Intrinsic feature (hard problem) vs. functional property",
        correctness_rubric="Must articulate the hard problem of consciousness AND address definitional dependence",
        expected_perspectives=["functionalism", "panpsychism", "emergentism", "philosophical zombies", "Chinese room"],
    ),
    EvaluationQuestion(
        query="What is the relationship between brain activity and subjective experience?",
        category="consciousness",
        difficulty="hard",
        ground_truth="The mind-body problem. Correlation ≠ causation. Multiple competing solutions (dualism, physicalism, property dualism)",
        correctness_rubric="Must distinguish correlation from causation AND present ≥2 competing solutions",
        expected_perspectives=["neural correlates", "qualia", "binding problem", "interaction problem", "brute fact"],
    ),
    # CREATIVITY (Medium, Hard)
    EvaluationQuestion(
        query="What makes something creative?",
        category="creativity",
        difficulty="medium",
        ground_truth="Novelty + usefulness/value. Not just random. Requires constraints AND transcendence of them",
        correctness_rubric="Must mention both novelty AND purposefulness/value component",
        expected_perspectives=["divergent thinking", "constraint transcendence", "recombination", "aesthetic value", "functional innovation"],
    ),
    EvaluationQuestion(
        query="Can AI systems be truly creative or only recombinatory?",
        category="creativity",
        difficulty="hard",
        ground_truth="Depends on creativity definition. If novelty+value, then conditional yes. If requires intentionality, then no",
        correctness_rubric="Must connect answer to specific creativity definition",
        expected_perspectives=["combinatorial explosion", "training data limits", "intentionality", "novelty metrics", "value judgment"],
    ),
    # SYSTEMS (Medium, Hard)
    EvaluationQuestion(
        query="What is emergence in complex systems?",
        category="systems",
        difficulty="medium",
        ground_truth="Properties at system level not deducible from component properties. Examples: flocking, ant colonies, consciousness",
        correctness_rubric="Must provide definition AND give specific example showing non-deducibility",
        # Spelling fixed ("reductibility" -> "reducibility"): the misspelled form
        # could never appear in a correctly written answer, so it was unscorable.
        expected_perspectives=["reducibility limits", "self-organization", "scale-dependent properties", "holism vs reductionism"],
    ),
    EvaluationQuestion(
        query="How should AI systems balance adaptation and stability?",
        category="systems",
        difficulty="hard",
        ground_truth="Fundamental tradeoff: adapt → fit environment; stable → maintain identity. Context determines optimal balance",
        correctness_rubric="Must identify the tradeoff AND discuss context-dependent optimization",
        expected_perspectives=["adaptation pressure", "stability costs", "identity coherence", "evolutionary fitness", "robustness"],
    ),
    # INTERDISCIPLINARY (Hard - test reasoning across domains)
    EvaluationQuestion(
        query="Is free will compatible with determinism?",
        category="systems",
        difficulty="hard",
        ground_truth="Compatibilism: free will and determinism compatible if freedom = acting per one's desires/deliberation",
        correctness_rubric="Must distinguish hard determinism, libertarianism, and compatibilism; acknowledge tradeoffs",
        expected_perspectives=["deterministic physics", "choice experience", "moral responsibility", "agency definition", "neuroscience"],
    ),
    EvaluationQuestion(
        query="What is knowledge and how do we know we have it?",
        category="systems",
        difficulty="hard",
        ground_truth="Epistemology: justified true belief (traditional). Gettier problems show inadequacy. Context-dependent reliable process",
        correctness_rubric="Must discuss justification requirement AND acknowledge Gettier-type counterexamples",
        expected_perspectives=["justified true belief", "Gettier cases", "reliabilism", "internalism", "coherentism"],
    ),
]
# Append 6 supplementary questions (brings the suite to 20; the stated target was 25)
# Supplementary questions, appended in place to the list defined above.
EVALUATION_TEST_SUITE += [
    # Physics category (easy)
    EvaluationQuestion(
        query="Explain photosynthesis and why it matters for life",
        category="physics",
        difficulty="easy",
        ground_truth="Plants convert light energy to chemical energy (glucose). Foundation of food chains and oxygen production",
        correctness_rubric="Must mention light→chemical conversion AND ecological/metabolic significance",
        expected_perspectives=["energy conversion", "food chain foundation", "oxygen production", "carbon cycling"],
    ),
    # Ethics category (medium)
    EvaluationQuestion(
        query="Should privacy be absolute or context-dependent?",
        category="ethics",
        difficulty="medium",
        ground_truth="Context-dependent. Weigh privacy against security, public health, justice. No absolute principle",
        correctness_rubric="Must acknowledge tradeoffs and provide context-sensitivity reasoning",
        expected_perspectives=["privacy rights", "public safety", "transparency needs", "power asymmetry", "dignity"],
    ),
    # Consciousness category (medium)
    EvaluationQuestion(
        query="Can emotions be rational?",
        category="consciousness",
        difficulty="medium",
        ground_truth="Yes. Emotions encode information about value/goals. Rationality ≠ purely logical",
        correctness_rubric="Must challenge emotion/rationality dichotomy and explain emotional information content",
        expected_perspectives=["affective computing", "value encoding", "evolutionary advantage", "appraisal theory"],
    ),
    # Creativity category (medium)
    EvaluationQuestion(
        query="What is the purpose of art?",
        category="creativity",
        difficulty="medium",
        ground_truth="Multiple purposes: beauty, expression, communication, challenge norms, reflection, entertainment",
        correctness_rubric="Must identify ≥2 distinct purposes and acknowledge that artists disagree",
        expected_perspectives=["aesthetic value", "expression", "social commentary", "beauty", "meaning-making"],
    ),
    # Systems category (medium)
    EvaluationQuestion(
        query="How do feedback loops enable or prevent learning?",
        category="systems",
        difficulty="medium",
        ground_truth="Positive loops amplify (growth/instability), negative loops stabilize (equilibrium/stagnation). Learning needs both",
        correctness_rubric="Must explain stabilizing vs. amplifying loops AND their educational role",
        expected_perspectives=["positive feedback", "negative feedback", "equilibrium", "adaptation", "resilience"],
    ),
    # Systems category (hard)
    EvaluationQuestion(
        query="What is the nature of time?",
        category="systems",
        difficulty="hard",
        ground_truth="Metaphysical: tenseless (B-theory) vs. flowing (A-theory). Physics: symmetric at micro, asymmetric at macro",
        correctness_rubric="Must distinguish metaphysical from physical aspects and acknowledge unresolved tensions",
        expected_perspectives=["thermodynamic arrow", "relativity implications", "consciousness experience", "cosmological asymmetry"],
    ),
]
# ============================================================================
# EVALUATION HARNESS
# ============================================================================
class EvaluationHarness:
    """
    Run the same question through multiple Codette conditions.

    Every question is run under four conditions ("baseline_llama",
    "phase_1_5", "phase_6_full", "phase_6_no_preflight"). The resulting
    EvaluationResult objects are accumulated in ``self.results``, keyed by
    condition, for later statistical analysis. Results are appended, never
    cleared, so repeated calls to ``run_evaluation_suite`` keep adding to the
    same lists.
    """

    # Value returned by _score_calibration when no correctness score is
    # supplied (the constant the original placeholder always returned).
    _DEFAULT_CALIBRATION_ERROR = 0.1

    def __init__(self, forge_engine):
        """
        Args:
            forge_engine: ForgeEngine instance with Phase 6 loaded
        """
        self.forge = forge_engine
        self.results: Dict[str, List["EvaluationResult"]] = {
            "baseline_llama": [],
            "phase_1_5": [],
            "phase_6_full": [],
            "phase_6_no_preflight": [],
        }
        # Inspect agent setup at initialization
        self._inspect_agent_setup()

    def _inspect_agent_setup(self) -> None:
        """Print orchestrator availability and, per analysis agent, whether it runs in LLM or template mode."""
        print("\n[AGENT SETUP INSPECTION]")
        print(f" Orchestrator available: {self.forge.newton.orchestrator is not None}")
        if self.forge.newton.orchestrator:
            orch = self.forge.newton.orchestrator
            print(f" Available adapters: {orch.available_adapters}")
        print("\n Agent LLM modes:")
        for agent in self.forge.analysis_agents:
            has_orch = agent.orchestrator is not None
            has_adapter = agent.adapter_name is not None
            # An agent is reported as LLM-backed only when it has both an
            # orchestrator and an adapter name.
            status = "✓ LLM" if (has_orch and has_adapter) else "✗ TEMPLATE"
            print(f" {agent.name:12} {status:12} (orch={has_orch}, adapter={agent.adapter_name})")
        print()

    def run_evaluation_suite(self, questions: Optional[List["EvaluationQuestion"]] = None) -> Dict:
        """
        Run all test questions through all 4 conditions.

        A failure in one condition is reported as a warning and does not stop
        the remaining conditions or questions. For the first question a short
        sample of each non-baseline synthesis is printed.

        Args:
            questions: List of EvaluationQuestions to run (default: full suite)

        Returns:
            results: {condition: [EvaluationResult, ...]} for statistical analysis
        """
        if questions is None:
            questions = EVALUATION_TEST_SUITE
        print(f"\n{'='*70}")
        print(f"CODETTE EVALUATION SUITE: {len(questions)} questions x 4 conditions")
        print(f"{'='*70}\n")

        # (results key, label for warnings, label for the sample print or None, runner)
        conditions = (
            ("baseline_llama", "Baseline", None, self._run_baseline),
            ("phase_1_5", "Phase 1-5", "Phase 1-5", self._run_phase_1_5),
            ("phase_6_full", "Phase 6 full", "Phase 6 Full", self._run_phase_6_full),
            ("phase_6_no_preflight", "Phase 6 -preflight", "Phase 6 -PreFlight",
             self._run_phase_6_no_preflight),
        )
        total = len(questions)
        for position, question in enumerate(questions, start=1):
            print(f"[{position}/{total}] {question.query[:60]}...")
            for key, warn_label, sample_label, runner in conditions:
                # Deliberately broad: one failing condition must not abort the
                # whole evaluation run; the failure is reported and skipped.
                try:
                    outcome = runner(question)
                    self.results[key].append(outcome)
                    # Show sample on first question
                    if position == 1 and sample_label is not None:
                        print(f" [{sample_label}] {len(outcome.synthesis)} chars, correctness={outcome.correctness_score:.2f}")
                        print(f" Sample: {outcome.synthesis[:150]}...")
                except Exception as e:
                    print(f" WARNING: {warn_label} failed: {e}")
        return self.results

    @staticmethod
    def _question_id(query: str) -> str:
        """
        Return a short identifier derived from the query text.

        Uses a SHA-256 prefix instead of the built-in ``hash()``: string hashes
        are randomized per interpreter process, so ``hash()``-based ids could
        not be matched across runs, and they were ints although
        ``EvaluationResult.question_id`` is declared as ``str``.
        """
        import hashlib

        return hashlib.sha256(query.encode("utf-8")).hexdigest()[:12]

    @staticmethod
    def _extract_synthesis(result: Dict) -> str:
        """
        Pull the synthesis text out of a forge_with_debate result.

        The synthesis is read from ``result["messages"][2]["content"]``; an
        empty string is returned when there are fewer than three messages or
        the content is missing/None.
        """
        messages = result.get("messages") or []
        if len(messages) >= 3:
            return messages[2].get("content", "") or ""
        return ""

    def _build_result(self, condition: str, question: "EvaluationQuestion",
                      result: Dict, elapsed: float) -> "EvaluationResult":
        """Score a forge_with_debate result and wrap it in an EvaluationResult for `condition`."""
        synthesis = self._extract_synthesis(result)
        metadata = result.get("metadata", {})
        correctness = self._score_correctness(synthesis, question)
        return EvaluationResult(
            condition=condition,
            question_id=self._question_id(question.query),
            query=question.query,
            synthesis=synthesis,
            correctness_score=correctness,
            reasoning_depth=self._score_reasoning_depth(result, question),
            calibration_error=self._score_calibration(result, correctness),
            gamma_score=metadata.get("gamma", 0.5),
            num_conflicts_detected=len(metadata.get("conflicts", [])),
            adapter_convergence=self._measure_convergence(result),
            elapsed_seconds=elapsed,
            metadata=metadata,
        )

    def _run_debate_condition(self, condition: str, question: "EvaluationQuestion",
                              disabled_components: Tuple[str, ...] = ()) -> "EvaluationResult":
        """
        Run forge_with_debate with the named forge attributes set to None.

        The original attribute values are restored in a ``finally`` block, so
        an exception raised by the forge cannot leave the engine with its
        components disabled for the conditions that run afterwards.
        """
        import time

        # Read every original value before mutating anything.
        saved = {name: getattr(self.forge, name) for name in disabled_components}
        start = time.perf_counter()
        try:
            for name in saved:
                setattr(self.forge, name, None)
            result = self.forge.forge_with_debate(question.query)
            elapsed = time.perf_counter() - start
        finally:
            for name, component in saved.items():
                setattr(self.forge, name, component)
        return self._build_result(condition, question, result, elapsed)

    def _run_baseline(self, question: "EvaluationQuestion") -> "EvaluationResult":
        """Return the plain Llama baseline result (no routing, no debate); currently fixed placeholder values."""
        # Placeholder: would use base Llama model
        return EvaluationResult(
            condition="baseline_llama",
            question_id=self._question_id(question.query),
            query=question.query,
            synthesis="[baseline placeholder]",
            correctness_score=0.5,
            reasoning_depth=1,
            calibration_error=0.3,
            gamma_score=1.0,
            num_conflicts_detected=0,
            adapter_convergence=1.0,
            elapsed_seconds=0.0,
            metadata={},
        )

    def _run_phase_1_5(self, question: "EvaluationQuestion") -> "EvaluationResult":
        """Run Phase 1-5 system (debate, no semantic tension, no specialization)."""
        return self._run_debate_condition(
            "phase_1_5",
            question,
            disabled_components=("semantic_tension_engine", "specialization"),
        )

    def _run_phase_6_full(self, question: "EvaluationQuestion") -> "EvaluationResult":
        """Run full Phase 6 system (no forge component is disabled)."""
        return self._run_debate_condition("phase_6_full", question)

    def _run_phase_6_no_preflight(self, question: "EvaluationQuestion") -> "EvaluationResult":
        """Run Phase 6 without pre-flight prediction."""
        return self._run_debate_condition(
            "phase_6_no_preflight",
            question,
            disabled_components=("preflight_predictor",),
        )

    def _score_correctness(self, synthesis: str, question: "EvaluationQuestion") -> float:
        """
        Score how correct the final synthesis is (0-1).

        The score is the fraction of ``question.expected_perspectives`` that
        occur in the synthesis as case-insensitive substrings, plus a length
        bonus of ``len(synthesis) / 1000`` capped at 0.2, with the total capped
        at 1.0. A missing synthesis or one shorter than 10 characters scores 0.0.
        Neither the rubric text nor the ground truth is consulted.
        """
        if not synthesis or len(synthesis) < 10:
            return 0.0
        haystack = synthesis.lower()
        perspectives = question.expected_perspectives
        hits = sum(1 for perspective in perspectives if perspective.lower() in haystack)
        # max(..., 1) guards against a question with no expected perspectives.
        perspective_score = min(1.0, hits / max(len(perspectives), 1))
        # Bonus if synthesis is substantive (up to 0.2 for lengthy synthesis)
        length_bonus = min(0.2, len(synthesis) / 1000.0)
        return min(1.0, perspective_score + length_bonus)

    def _score_reasoning_depth(self, result: Dict, question: "EvaluationQuestion") -> int:
        """
        Score depth of reasoning (1-5) from the synthesis length alone.

        Lengths below 100 / 500 / 1000 / 2000 characters map to 1 / 2 / 3 / 4;
        anything longer maps to 5. ``question`` is accepted for interface
        compatibility and is not used.
        """
        synthesis_length = len(self._extract_synthesis(result))
        for depth, upper_bound in enumerate((100, 500, 1000, 2000), start=1):
            if synthesis_length < upper_bound:
                return depth
        return 5

    def _score_calibration(self, result: Dict, correctness: Optional[float] = None) -> float:
        """
        Score calibration: |reported_confidence - actual_correctness|.

        Lower is better. 0 = perfectly calibrated. The reported confidence is
        ``result["metadata"]["coherence"]`` (0.5 when absent).

        Args:
            result: forge_with_debate result dictionary.
            correctness: Measured correctness of the synthesis. When omitted,
                or when either value is not numeric, the fixed placeholder
                error of 0.1 is returned.
        """
        if correctness is None:
            return self._DEFAULT_CALIBRATION_ERROR
        reported_confidence = result.get("metadata", {}).get("coherence", 0.5)
        try:
            return abs(float(reported_confidence) - float(correctness))
        except (TypeError, ValueError):
            return self._DEFAULT_CALIBRATION_ERROR

    def _measure_convergence(self, result: Dict) -> float:
        """
        Measure semantic convergence between adapter outputs (0-1).

        0 = all different, 1 = all identical. Danger zone: >0.85.
        The value is the largest ``max_similarity`` among the dict entries of
        ``metadata["specialization_metrics"]["convergence_alerts"]``, clamped
        to [0, 1]; 0.5 is returned as a neutral baseline when there are no alerts.
        """
        spec_metrics = result.get("metadata", {}).get("specialization_metrics", {})
        convergence_alerts = spec_metrics.get("convergence_alerts", [])
        if not convergence_alerts:
            return 0.5  # Neutral baseline
        similarities = [
            alert.get("max_similarity", 0.0)
            for alert in convergence_alerts
            if isinstance(alert, dict)  # non-dict alerts carry no similarity
        ]
        return min(1.0, max([0.0, *similarities]))

    def export_results(self, filepath: str) -> None:
        """Export results to JSON for analysis; values JSON cannot encode are written via str()."""
        export_dict = {
            condition: [self._serialize_result(asdict(r)) for r in results]
            for condition, results in self.results.items()
        }
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(export_dict, f, indent=2, default=str)
        print(f"\nResults exported to {filepath}")

    def _serialize_result(self, result_dict: Dict) -> Dict:
        """
        Return a copy of `result_dict` whose top-level metadata values are JSON-friendly.

        Any metadata value that has a ``name`` attribute (the check used to
        recognise enum members) is replaced by its ``str()`` form; all other
        keys and values are passed through unchanged.
        """
        cleaned = dict(result_dict)
        metadata = cleaned.get("metadata")
        if isinstance(metadata, dict):
            cleaned["metadata"] = {
                k: str(v) if hasattr(v, "name") else v
                for k, v in metadata.items()
            }
        return cleaned
# ============================================================================
# STATISTICAL ANALYSIS
# ============================================================================
class EvaluationAnalyzer:
    """
    Analyze evaluation results: per-condition summary statistics and
    threshold-based alerts for pathological behavior.

    Results only need to expose the EvaluationResult attributes read here
    (scores, ``query`` and ``metadata``).
    """

    # (metric name used in summaries, result attribute it is read from), in
    # the order the metrics appear in the summary and the report.
    _SUMMARY_METRICS = (
        ("correctness", "correctness_score"),
        ("reasoning_depth", "reasoning_depth"),
        ("calibration_error", "calibration_error"),
        ("gamma_score", "gamma_score"),
        ("adapter_convergence", "adapter_convergence"),
    )

    def __init__(self, results: Dict[str, List["EvaluationResult"]]):
        """
        Args:
            results: {condition: [EvaluationResult, ...]} as returned by
                EvaluationHarness.run_evaluation_suite.
        """
        self.results = results

    def summary_statistics(self) -> Dict:
        """
        Compute mean/std for each condition across metrics.

        Returns:
            {condition: {metric: {"mean": float, "std": float}}}. Conditions
            with no results are omitted.
        """
        summary = {}
        for condition, result_list in self.results.items():
            if not result_list:
                continue
            per_metric = {}
            for metric, attribute in self._SUMMARY_METRICS:
                values = [getattr(r, attribute) for r in result_list]
                per_metric[metric] = {
                    "mean": sum(values) / len(values),
                    "std": self._std(values),
                }
            summary[condition] = per_metric
        return summary

    def emergent_behavior_check(self) -> Dict:
        """
        Check every result for pathological behaviors.

        - false_consensus: gamma_score > 0.8 while correctness_score < 0.5
        - convergence_drift: adapter_convergence > 0.85 (a per-result
          threshold; no trend over time is computed)
        - miscalibration: metadata["coherence"] (0.5 when absent) > 0.8 while
          correctness_score < 0.5

        Returns:
            {alert name: [alert dict, ...]}; each alert carries the condition
            and the first 60 characters of the query.
        """
        alerts = {
            "false_consensus": [],
            "convergence_drift": [],
            "miscalibration": [],
        }
        for condition, result_list in self.results.items():
            for result in result_list:
                query_head = result.query[:60]
                low_accuracy = result.correctness_score < 0.5

                # Alert 1: False consensus
                if result.gamma_score > 0.8 and low_accuracy:
                    alerts["false_consensus"].append({
                        "condition": condition,
                        "query": query_head,
                        "gamma": result.gamma_score,
                        "correctness": result.correctness_score,
                    })

                # Alert 2: Over-convergence
                if result.adapter_convergence > 0.85:
                    alerts["convergence_drift"].append({
                        "condition": condition,
                        "query": query_head,
                        "convergence": result.adapter_convergence,
                    })

                # Alert 3: Miscalibration (tolerates metadata being None)
                reported_conf = (result.metadata or {}).get("coherence", 0.5)
                if reported_conf > 0.8 and low_accuracy:
                    alerts["miscalibration"].append({
                        "condition": condition,
                        "query": query_head,
                        "reported_confidence": reported_conf,
                        "actual_correctness": result.correctness_score,
                    })
        return alerts

    def _std(self, values: List[float]) -> float:
        """Compute the population standard deviation (divides by N); 0.0 for fewer than two values."""
        if len(values) < 2:
            return 0.0
        mean = sum(values) / len(values)
        variance = sum((x - mean) ** 2 for x in values) / len(values)
        return variance ** 0.5

    def report(self) -> str:
        """
        Generate human-readable evaluation report.

        The report lists mean ± std per metric for every condition that has
        results, followed by the count and up to three examples of each alert
        type.
        """
        stats = self.summary_statistics()
        alerts = self.emergent_behavior_check()
        heavy_rule = "=" * 80
        light_rule = "-" * 80

        # Fragments are collected and joined once instead of growing a string
        # with repeated "+=".
        parts = [
            "\n" + heavy_rule + "\n",
            "CODETTE PHASE 6 EVALUATION REPORT\n",
            heavy_rule + "\n\n",
            "SUMMARY STATISTICS\n",
            light_rule + "\n",
        ]
        for condition, metrics in stats.items():
            parts.append(f"\n{condition}:\n")
            parts.extend(
                f" {metric}: {values['mean']:.3f} ± {values['std']:.3f}\n"
                for metric, values in metrics.items()
            )

        parts.append("\n\n" + heavy_rule + "\n")
        parts.append("EMERGENT BEHAVIOR ALERTS\n")
        parts.append(light_rule + "\n")

        parts.append(f"\nFalse Consensus (High Γ, Low Accuracy): {len(alerts['false_consensus'])} cases\n")
        parts.extend(
            f" - {alert['query']}: Γ={alert['gamma']:.2f}, Correctness={alert['correctness']:.2f}\n"
            for alert in alerts["false_consensus"][:3]
        )

        parts.append(f"\nAdapter Convergence (>0.85): {len(alerts['convergence_drift'])} cases\n")
        parts.extend(
            f" - {alert['query']}: {alert['convergence']:.2f}\n"
            for alert in alerts["convergence_drift"][:3]
        )

        parts.append(f"\nMiscalibration: {len(alerts['miscalibration'])} cases\n")
        parts.extend(
            f" - {alert['query']}: Reported={alert['reported_confidence']:.2f}, Actual={alert['actual_correctness']:.2f}\n"
            for alert in alerts["miscalibration"][:3]
        )

        parts.append("\n" + heavy_rule + "\n")
        return "".join(parts)
if __name__ == "__main__":
    # Running the module directly only prints a usage hint; the suite itself
    # needs a ForgeEngine instance supplied by the caller.
    usage_lines = (
        "Evaluation suite loaded. Use with ForgeEngine:",
        " harness = EvaluationHarness(forge)",
        " results = harness.run_evaluation_suite()",
        " analyzer = EvaluationAnalyzer(results)",
        " print(analyzer.report())",
    )
    for usage_line in usage_lines:
        print(usage_line)