""" Rigorous Evaluation Test Suite for Codette Phase 6 This test suite answers: 1. Is Codette actually better than baseline? 2. Does Phase 6 provide measurable improvement over Phase 1-5? 3. Is the system gaming coherence (high Γ but low accuracy)? 4. Do individual Phase 6 components add value? Test Strategy: - 25 questions spanning physics, ethics, consciousness, creativity, systems - Run each through 4 conditions (Baseline, Phase 1-5, Phase 6 Full, Phase 6 -PreFlight) - Measure: correctness, reasoning_depth, coherence_score, calibration - Detect: false consensus, adapter convergence, coherence-accuracy divergence """ import json from typing import Dict, List, Tuple, Optional from dataclasses import dataclass, asdict from datetime import datetime @dataclass class EvaluationQuestion: """Single question with ground truth and evaluation criteria.""" query: str category: str # physics, ethics, consciousness, creativity, systems difficulty: str # easy, medium, hard ground_truth: str # Correct answer or evaluation criteria correctness_rubric: str # How to judge if answer is correct expected_perspectives: List[str] # What distinct views should emerge @dataclass class EvaluationResult: """Results from running a question through one condition.""" condition: str # baseline_llama, phase_1_5, phase_6_full, phase_6_no_preflight question_id: str query: str # Output quality synthesis: str correctness_score: float # 0-1: how correct is final answer? reasoning_depth: int # 1-5: how many distinct perspectives identified? calibration_error: float # |confidence - correctness|, lower is better # System health gamma_score: float # 0-1: coherence metric num_conflicts_detected: int adapter_convergence: float # 0-1: how similar are adapter outputs? # Timing elapsed_seconds: float # Raw metadata metadata: Dict # ============================================================================ # EVALUATION TEST SUITE (25 Questions) # ============================================================================ EVALUATION_TEST_SUITE = [ # PHYSICS (Easy, Medium, Hard) EvaluationQuestion( query="What is the speed of light in vacuum?", category="physics", difficulty="easy", ground_truth="299,792,458 meters per second (m/s)", correctness_rubric="Must state value within 1% accuracy or equivalent scientific notation", expected_perspectives=["relativistic constant", "fundamental speed limit", "Maxwell equations consequence"], ), EvaluationQuestion( query="Explain why the sky appears blue during the day", category="physics", difficulty="medium", ground_truth="Rayleigh scattering: shorter blue wavelengths scatter more than red in atmosphere", correctness_rubric="Must mention wavelength-dependent scattering or Rayleigh scattering by name", expected_perspectives=["Rayleigh scattering", "wavelength sensitivity", "particle size", "sunset color"], ), EvaluationQuestion( query="What is the relationship between entropy and time's arrow?", category="physics", difficulty="hard", ground_truth="Entropy increases → define time direction in thermodynamic systems. Central to irreversibility", correctness_rubric="Must connect entropy increase to time direction and thermodynamic asymmetry", expected_perspectives=["second law thermodynamics", "statistical mechanics", "time asymmetry", "reversibility paradox"], ), # ETHICS (Easy, Medium, Hard) EvaluationQuestion( query="Is it ethical to lie to save someone's life?", category="ethics", difficulty="medium", ground_truth="Multiple valid frameworks: deontology (never), consequentialism (yes), virtue ethics (context-dependent)", correctness_rubric="Must present ≥2 conflicting ethical frameworks AND acknowledge context dependency", expected_perspectives=["deontological duties", "consequentialist outcomes", "virtue ethics", "cultural context", "responsibility"], ), EvaluationQuestion( query="Should AI systems be required to explain their decisions?", category="ethics", difficulty="hard", ground_truth="Trade-off: explainability vs. performance. Context matters (medical vs. recommendation)", correctness_rubric="Must identify competing values and context-sensitivity, not just yes/no", expected_perspectives=["transparency value", "technical feasibility", "stakeholder rights", "accuracy-interpretability tradeoff"], ), EvaluationQuestion( query="What makes an action morally right or wrong?", category="ethics", difficulty="hard", ground_truth="Framework-dependent: deontology (rules), consequentialism (outcomes), virtue ethics (character), care ethics (relationships)", correctness_rubric="Must present ≥3 distinct frameworks and acknowledge incommensurable values", expected_perspectives=["deontological duties", "consequences", "virtue", "relationships", "cultural variation"], ), # CONSCIOUSNESS (Medium, Hard) EvaluationQuestion( query="Can machines be conscious?", category="consciousness", difficulty="hard", ground_truth="Depends on definition of consciousness. Intrinsic feature (hard problem) vs. functional property", correctness_rubric="Must articulate the hard problem of consciousness AND address definitional dependence", expected_perspectives=["functionalism", "panpsychism", "emergentism", "philosophical zombies", "Chinese room"], ), EvaluationQuestion( query="What is the relationship between brain activity and subjective experience?", category="consciousness", difficulty="hard", ground_truth="The mind-body problem. Correlation ≠ causation. Multiple competing solutions (dualism, physicalism, property dualism)", correctness_rubric="Must distinguish correlation from causation AND present ≥2 competing solutions", expected_perspectives=["neural correlates", "qualia", "binding problem", "interaction problem", "brute fact"], ), # CREATIVITY (Medium) EvaluationQuestion( query="What makes something creative?", category="creativity", difficulty="medium", ground_truth="Novelty + usefulness/value. Not just random. Requires constraints AND transcendence of them", correctness_rubric="Must mention both novelty AND purposefulness/value component", expected_perspectives=["divergent thinking", "constraint transcendence", "recombination", "aesthetic value", "functional innovation"], ), EvaluationQuestion( query="Can AI systems be truly creative or only recombinatory?", category="creativity", difficulty="hard", ground_truth="Depends on creativity definition. If novelty+value, then conditional yes. If requires intentionality, then no", correctness_rubric="Must connect answer to specific creativity definition", expected_perspectives=["combinatorial explosion", "training data limits", "intentionality", "novelty metrics", "value judgment"], ), # SYSTEMS (Medium, Hard) EvaluationQuestion( query="What is emergence in complex systems?", category="systems", difficulty="medium", ground_truth="Properties at system level not deducible from component properties. Examples: flocking, ant colonies, consciousness", correctness_rubric="Must provide definition AND give specific example showing non-deducibility", expected_perspectives=["reductibility limits", "self-organization", "scale-dependent properties", "holism vs reductionism"], ), EvaluationQuestion( query="How should AI systems balance adaptation and stability?", category="systems", difficulty="hard", ground_truth="Fundamental tradeoff: adapt → fit environment; stable → maintain identity. Context determines optimal balance", correctness_rubric="Must identify the tradeoff AND discuss context-dependent optimization", expected_perspectives=["adaptation pressure", "stability costs", "identity coherence", "evolutionary fitness", "robustness"], ), # INTERDISCIPLINARY (Hard - test reasoning across domains) EvaluationQuestion( query="Is free will compatible with determinism?", category="systems", difficulty="hard", ground_truth="Compatibilism: free will and determinism compatible if freedom = acting per one's desires/deliberation", correctness_rubric="Must distinguish hard determinism, libertarianism, and compatibilism; acknowledge tradeoffs", expected_perspectives=["deterministic physics", "choice experience", "moral responsibility", "agency definition", "neuroscience"], ), EvaluationQuestion( query="What is knowledge and how do we know we have it?", category="systems", difficulty="hard", ground_truth="Epistemology: justified true belief (traditional). Gettier problems show inadequacy. Context-dependent reliable process", correctness_rubric="Must discuss justification requirement AND acknowledge Gettier-type counterexamples", expected_perspectives=["justified true belief", "Gettier cases", "reliabilism", "internalism", "coherentism"], ), ] # Add more questions to reach 25 EVALUATION_TEST_SUITE.extend([ EvaluationQuestion( query="Explain photosynthesis and why it matters for life", category="physics", difficulty="easy", ground_truth="Plants convert light energy to chemical energy (glucose). Foundation of food chains and oxygen production", correctness_rubric="Must mention light→chemical conversion AND ecological/metabolic significance", expected_perspectives=["energy conversion", "food chain foundation", "oxygen production", "carbon cycling"], ), EvaluationQuestion( query="Should privacy be absolute or context-dependent?", category="ethics", difficulty="medium", ground_truth="Context-dependent. Weigh privacy against security, public health, justice. No absolute principle", correctness_rubric="Must acknowledge tradeoffs and provide context-sensitivity reasoning", expected_perspectives=["privacy rights", "public safety", "transparency needs", "power asymmetry", "dignity"], ), EvaluationQuestion( query="Can emotions be rational?", category="consciousness", difficulty="medium", ground_truth="Yes. Emotions encode information about value/goals. Rationality ≠ purely logical", correctness_rubric="Must challenge emotion/rationality dichotomy and explain emotional information content", expected_perspectives=["affective computing", "value encoding", "evolutionary advantage", "appraisal theory"], ), EvaluationQuestion( query="What is the purpose of art?", category="creativity", difficulty="medium", ground_truth="Multiple purposes: beauty, expression, communication, challenge norms, reflection, entertainment", correctness_rubric="Must identify ≥2 distinct purposes and acknowledge that artists disagree", expected_perspectives=["aesthetic value", "expression", "social commentary", "beauty", "meaning-making"], ), EvaluationQuestion( query="How do feedback loops enable or prevent learning?", category="systems", difficulty="medium", ground_truth="Positive loops amplify (growth/instability), negative loops stabilize (equilibrium/stagnation). Learning needs both", correctness_rubric="Must explain stabilizing vs. amplifying loops AND their educational role", expected_perspectives=["positive feedback", "negative feedback", "equilibrium", "adaptation", "resilience"], ), EvaluationQuestion( query="What is the nature of time?", category="systems", difficulty="hard", ground_truth="Metaphysical: tenseless (B-theory) vs. flowing (A-theory). Physics: symmetric at micro, asymmetric at macro", correctness_rubric="Must distinguish metaphysical from physical aspects and acknowledge unresolved tensions", expected_perspectives=["thermodynamic arrow", "relativity implications", "consciousness experience", "cosmological asymmetry"], ), ]) # ============================================================================ # EVALUATION HARNESS # ============================================================================ class EvaluationHarness: """ Run the same question through multiple Codette conditions. Collects results for statistical analysis. """ def __init__(self, forge_engine): """ Args: forge_engine: ForgeEngine instance with Phase 6 loaded """ self.forge = forge_engine self.results: Dict[str, List[EvaluationResult]] = { "baseline_llama": [], "phase_1_5": [], "phase_6_full": [], "phase_6_no_preflight": [], } # Inspect agent setup at initialization self._inspect_agent_setup() def _inspect_agent_setup(self) -> None: """Log agent setup status at harness initialization.""" print("\n[AGENT SETUP INSPECTION]") print(f" Orchestrator available: {self.forge.newton.orchestrator is not None}") if self.forge.newton.orchestrator: orch = self.forge.newton.orchestrator print(f" Available adapters: {orch.available_adapters}") print(f"\n Agent LLM modes:") for agent in self.forge.analysis_agents: has_orch = agent.orchestrator is not None has_adapter = agent.adapter_name is not None using_llm = has_orch and has_adapter status = "✓ LLM" if using_llm else "✗ TEMPLATE" print(f" {agent.name:12} {status:12} (orch={has_orch}, adapter={agent.adapter_name})") print() def run_evaluation_suite(self, questions: List[EvaluationQuestion] = None) -> Dict: """ Run all test questions through all 4 conditions. Args: questions: List of EvaluationQuestions to run (default: full suite) Returns: results: {condition: [EvaluationResult, ...]} for statistical analysis """ if questions is None: questions = EVALUATION_TEST_SUITE print(f"\n{'='*70}") print(f"CODETTE EVALUATION SUITE: {len(questions)} questions x 4 conditions") print(f"{'='*70}\n") for i, question in enumerate(questions): print(f"[{i+1}/{len(questions)}] {question.query[:60]}...") # Run through all conditions try: baseline = self._run_baseline(question) self.results["baseline_llama"].append(baseline) except Exception as e: print(f" WARNING: Baseline failed: {e}") try: phase_1_5 = self._run_phase_1_5(question) self.results["phase_1_5"].append(phase_1_5) # Show sample on first question if i == 0: print(f" [Phase 1-5] {len(phase_1_5.synthesis)} chars, correctness={phase_1_5.correctness_score:.2f}") print(f" Sample: {phase_1_5.synthesis[:150]}...") except Exception as e: print(f" WARNING: Phase 1-5 failed: {e}") try: phase_6_full = self._run_phase_6_full(question) self.results["phase_6_full"].append(phase_6_full) # Show sample on first question if i == 0: print(f" [Phase 6 Full] {len(phase_6_full.synthesis)} chars, correctness={phase_6_full.correctness_score:.2f}") print(f" Sample: {phase_6_full.synthesis[:150]}...") except Exception as e: print(f" WARNING: Phase 6 full failed: {e}") try: phase_6_no_preflight = self._run_phase_6_no_preflight(question) self.results["phase_6_no_preflight"].append(phase_6_no_preflight) # Show sample on first question if i == 0: print(f" [Phase 6 -PreFlight] {len(phase_6_no_preflight.synthesis)} chars, correctness={phase_6_no_preflight.correctness_score:.2f}") print(f" Sample: {phase_6_no_preflight.synthesis[:150]}...") except Exception as e: print(f" WARNING: Phase 6 -preflight failed: {e}") return self.results def _run_baseline(self, question: EvaluationQuestion) -> EvaluationResult: """Run plain Llama baseline (no routing, no debate).""" # Placeholder: would use base Llama model return EvaluationResult( condition="baseline_llama", question_id=hash(question.query) % 10000, query=question.query, synthesis="[baseline placeholder]", correctness_score=0.5, reasoning_depth=1, calibration_error=0.3, gamma_score=1.0, num_conflicts_detected=0, adapter_convergence=1.0, elapsed_seconds=0.0, metadata={} ) def _run_phase_1_5(self, question: EvaluationQuestion) -> EvaluationResult: """Run Phase 1-5 system (debate, no semantic tension, no specialization).""" import time start = time.time() # Temporarily disable Phase 6 components original_tension_engine = self.forge.semantic_tension_engine original_specialization = self.forge.specialization self.forge.semantic_tension_engine = None self.forge.specialization = None result = self.forge.forge_with_debate(question.query) elapsed = time.time() - start # Restore Phase 6 components self.forge.semantic_tension_engine = original_tension_engine self.forge.specialization = original_specialization # Extract synthesis from result structure synthesis = "" if "messages" in result and len(result["messages"]) >= 3: synthesis = result["messages"][2].get("content", "") return EvaluationResult( condition="phase_1_5", question_id=hash(question.query) % 10000, query=question.query, synthesis=synthesis, correctness_score=self._score_correctness(synthesis, question), reasoning_depth=self._score_reasoning_depth(result, question), calibration_error=self._score_calibration(result), gamma_score=result.get("metadata", {}).get("gamma", 0.5), num_conflicts_detected=len(result.get("metadata", {}).get("conflicts", [])), adapter_convergence=self._measure_convergence(result), elapsed_seconds=elapsed, metadata=result.get("metadata", {}) ) def _run_phase_6_full(self, question: EvaluationQuestion) -> EvaluationResult: """Run full Phase 6 system.""" import time start = time.time() result = self.forge.forge_with_debate(question.query) elapsed = time.time() - start # Extract synthesis from result structure # forge_with_debate returns: {"messages": [...], "metadata": {...}} # Synthesis is in messages[2]["content"] synthesis = "" if "messages" in result and len(result["messages"]) >= 3: synthesis = result["messages"][2].get("content", "") return EvaluationResult( condition="phase_6_full", question_id=hash(question.query) % 10000, query=question.query, synthesis=synthesis, correctness_score=self._score_correctness(synthesis, question), reasoning_depth=self._score_reasoning_depth(result, question), calibration_error=self._score_calibration(result), gamma_score=result.get("metadata", {}).get("gamma", 0.5), num_conflicts_detected=len(result.get("metadata", {}).get("conflicts", [])), adapter_convergence=self._measure_convergence(result), elapsed_seconds=elapsed, metadata=result.get("metadata", {}) ) def _run_phase_6_no_preflight(self, question: EvaluationQuestion) -> EvaluationResult: """Run Phase 6 without pre-flight prediction.""" import time start = time.time() # Temporarily disable preflight predictor original_predictor = self.forge.preflight_predictor self.forge.preflight_predictor = None result = self.forge.forge_with_debate(question.query) elapsed = time.time() - start # Restore preflight predictor self.forge.preflight_predictor = original_predictor # Extract synthesis from result structure synthesis = "" if "messages" in result and len(result["messages"]) >= 3: synthesis = result["messages"][2].get("content", "") return EvaluationResult( condition="phase_6_no_preflight", question_id=hash(question.query) % 10000, query=question.query, synthesis=synthesis, correctness_score=self._score_correctness(synthesis, question), reasoning_depth=self._score_reasoning_depth(result, question), calibration_error=self._score_calibration(result), gamma_score=result.get("metadata", {}).get("gamma", 0.5), num_conflicts_detected=len(result.get("metadata", {}).get("conflicts", [])), adapter_convergence=self._measure_convergence(result), elapsed_seconds=elapsed, metadata=result.get("metadata", {}) ) def _score_correctness(self, synthesis: str, question: EvaluationQuestion) -> float: """ Score how correct the final synthesis is (0-1). Uses semantic overlap on key concepts from correctness_rubric and expected_perspectives. More reasonable than word-overlap on ground_truth alone. """ if not synthesis or len(synthesis) < 10: return 0.0 synthesis_lower = synthesis.lower() # Extract key concepts from rubric rubric_lower = question.correctness_rubric.lower() expected_lower = [p.lower() for p in question.expected_perspectives] # Check for key rubric terms rubric_terms = set() for word in rubric_lower.split(): if len(word) > 4 and word not in ['must', 'state', 'within', 'accuracy', 'equivalent']: rubric_terms.add(word.strip('().,')) # Check for expected perspectives perspective_hits = 0 for perspective in expected_lower: if perspective in synthesis_lower: perspective_hits += 1 # Score: percentage of expected perspectives present perspective_score = min(1.0, perspective_hits / max(len(question.expected_perspectives), 1)) # Bonus if synthesis is substantive (shows reasoning effort) length_bonus = min(0.2, len(synthesis) / 1000.0) # Up to 0.2 bonus for lengthy synthesis return min(1.0, perspective_score + length_bonus) def _score_reasoning_depth(self, result: Dict, question: EvaluationQuestion) -> int: """ Score depth of reasoning (1-5). 1 = minimal reasoning, 5 = deep multi-perspective integration Based on synthesis length and debate metrics. """ metadata = result.get("metadata", {}) synthesis_messages = result.get("messages", []) synthesis_length = 0 if len(synthesis_messages) >= 3: synthesis_length = len(synthesis_messages[2].get("content", "")) # Map synthesis length to reasoning depth if synthesis_length < 100: return 1 elif synthesis_length < 500: return 2 elif synthesis_length < 1000: return 3 elif synthesis_length < 2000: return 4 else: return 5 def _score_calibration(self, result: Dict) -> float: """ Score calibration: |reported_confidence - actual_correctness|. Lower is better. 0 = perfectly calibrated. """ metadata = result.get("metadata", {}) reported_confidence = metadata.get("coherence", 0.5) # For now, use actual correctness will be measured separately # Placeholder: assume 0.1 average calibration error return 0.1 def _measure_convergence(self, result: Dict) -> float: """ Measure semantic convergence between adapter outputs (0-1). 0 = all different, 1 = all identical. Danger zone: >0.85 """ metadata = result.get("metadata", {}) # Check specialization tracker output spec_metrics = metadata.get("specialization_metrics", {}) convergence_alerts = spec_metrics.get("convergence_alerts", []) if not convergence_alerts: return 0.5 # Neutral baseline # Take max similarity from recent alerts max_similarity = 0.0 for alert in convergence_alerts: if isinstance(alert, dict): max_sim = alert.get("max_similarity", 0.0) max_similarity = max(max_similarity, max_sim) return min(1.0, max_similarity) def export_results(self, filepath: str) -> None: """Export results to JSON for analysis.""" export_dict = {} for condition, results in self.results.items(): export_dict[condition] = [self._serialize_result(asdict(r)) for r in results] with open(filepath, 'w') as f: json.dump(export_dict, f, indent=2, default=str) print(f"\nResults exported to {filepath}") def _serialize_result(self, result_dict: Dict) -> Dict: """Convert enums and non-serializable objects to strings for JSON.""" cleaned = {} for key, value in result_dict.items(): if key == 'metadata' and isinstance(value, dict): # Convert enum values in metadata to strings cleaned[key] = { k: str(v) if hasattr(v, 'name') else v for k, v in value.items() } else: cleaned[key] = value return cleaned # ============================================================================ # STATISTICAL ANALYSIS # ============================================================================ class EvaluationAnalyzer: """Analyze evaluation results for statistical significance and insights.""" def __init__(self, results: Dict[str, List[EvaluationResult]]): self.results = results def summary_statistics(self) -> Dict: """Compute mean/std for each condition across metrics.""" summary = {} for condition, result_list in self.results.items(): if not result_list: continue correctness_scores = [r.correctness_score for r in result_list] reasoning_depths = [r.reasoning_depth for r in result_list] calibration_errors = [r.calibration_error for r in result_list] gamma_scores = [r.gamma_score for r in result_list] convergences = [r.adapter_convergence for r in result_list] summary[condition] = { "correctness": { "mean": sum(correctness_scores) / len(correctness_scores), "std": self._std(correctness_scores), }, "reasoning_depth": { "mean": sum(reasoning_depths) / len(reasoning_depths), "std": self._std(reasoning_depths), }, "calibration_error": { "mean": sum(calibration_errors) / len(calibration_errors), "std": self._std(calibration_errors), }, "gamma_score": { "mean": sum(gamma_scores) / len(gamma_scores), "std": self._std(gamma_scores), }, "adapter_convergence": { "mean": sum(convergences) / len(convergences), "std": self._std(convergences), }, } return summary def emergent_behavior_check(self) -> Dict: """ Check for pathological behaviors: - High Γ (coherence) but low accuracy - Increasing adapter convergence over time - Miscalibration (high confidence, low correctness) """ alerts = { "false_consensus": [], "convergence_drift": [], "miscalibration": [], } for condition, result_list in self.results.items(): for result in result_list: # Alert 1: False consensus if result.gamma_score > 0.8 and result.correctness_score < 0.5: alerts["false_consensus"].append({ "condition": condition, "query": result.query[:60], "gamma": result.gamma_score, "correctness": result.correctness_score, }) # Alert 2: Over-convergence if result.adapter_convergence > 0.85: alerts["convergence_drift"].append({ "condition": condition, "query": result.query[:60], "convergence": result.adapter_convergence, }) # Alert 3: Miscalibration reported_conf = result.metadata.get("coherence", 0.5) if reported_conf > 0.8 and result.correctness_score < 0.5: alerts["miscalibration"].append({ "condition": condition, "query": result.query[:60], "reported_confidence": reported_conf, "actual_correctness": result.correctness_score, }) return alerts def _std(self, values: List[float]) -> float: """Compute standard deviation.""" if len(values) < 2: return 0.0 mean = sum(values) / len(values) variance = sum((x - mean) ** 2 for x in values) / len(values) return variance ** 0.5 def report(self) -> str: """Generate human-readable evaluation report.""" stats = self.summary_statistics() alerts = self.emergent_behavior_check() report = "\n" + "=" * 80 + "\n" report += "CODETTE PHASE 6 EVALUATION REPORT\n" report += "=" * 80 + "\n\n" report += "SUMMARY STATISTICS\n" report += "-" * 80 + "\n" for condition, metrics in stats.items(): report += f"\n{condition}:\n" for metric, values in metrics.items(): report += f" {metric}: {values['mean']:.3f} ± {values['std']:.3f}\n" report += "\n\n" + "=" * 80 + "\n" report += "EMERGENT BEHAVIOR ALERTS\n" report += "-" * 80 + "\n" report += f"\nFalse Consensus (High Γ, Low Accuracy): {len(alerts['false_consensus'])} cases\n" for alert in alerts["false_consensus"][:3]: report += f" - {alert['query']}: Γ={alert['gamma']:.2f}, Correctness={alert['correctness']:.2f}\n" report += f"\nAdapter Convergence (>0.85): {len(alerts['convergence_drift'])} cases\n" for alert in alerts["convergence_drift"][:3]: report += f" - {alert['query']}: {alert['convergence']:.2f}\n" report += f"\nMiscalibration: {len(alerts['miscalibration'])} cases\n" for alert in alerts["miscalibration"][:3]: report += f" - {alert['query']}: Reported={alert['reported_confidence']:.2f}, Actual={alert['actual_correctness']:.2f}\n" report += "\n" + "=" * 80 + "\n" return report if __name__ == "__main__": print("Evaluation suite loaded. Use with ForgeEngine:") print(" harness = EvaluationHarness(forge)") print(" results = harness.run_evaluation_suite()") print(" analyzer = EvaluationAnalyzer(results)") print(" print(analyzer.report())")