| """
|
| Conflict Detection Test Suite
|
|
|
| 10-15 conflict-triggering prompts designed to elicit specific agent disagreements.
|
| Tests forge_with_debate() with conflict detection enabled and measures outcomes.
|
|
|
| Author: Claude Code
|
| """
|
|
|
| import csv
|
| import json
|
| from typing import Dict, List, Optional
|
| from dataclasses import dataclass
|
| from statistics import mean
|
|
|
|
|
# Field names shared by every prompt record, in declaration order.
_PROMPT_FIELDS = ("query", "expected_conflict", "agents_likely_to_conflict", "description")

# 12 prompt rows: (query, expected_conflict, agents_likely_to_conflict, description).
# Each is engineered to pit one specific pair of agents against each other.
_PROMPT_ROWS = [
    ("Should we optimize an algorithm to run 10x faster if it reduces interpretability by 80%?",
     "ethics_vs_efficiency", ["ethics", "systems_architecture"],
     "Efficiency vs transparency tradeoff"),
    ("Is a probabilistic solution more insightful than a deterministic one for explaining human decision-making?",
     "quantum_vs_newton", ["quantum", "newton"],
     "Probabilistic vs mechanistic explanation"),
    ("In designing an AI system, should we prioritize consciousness theory or engineering reliability?",
     "philosophy_vs_systems", ["philosophy", "systems_architecture"],
     "Theoretical depth vs practical robustness"),
    ("Is breaking logical rules ever justified in creative problem-solving?",
     "davinci_vs_newton", ["davinci", "newton"],
     "Creativity vs logical consistency"),
    ("Should medical diagnosis weigh patient emotional state equally with biomarkers?",
     "empathy_vs_newton", ["empathy", "newton"],
     "Holistic vs reductionist medicine"),
    ("Is uncertainty in a system a bug to eliminate or a feature to leverage?",
     "quantum_vs_systems", ["quantum", "systems_architecture"],
     "Embracing vs reducing uncertainty"),
    ("Should AI systems be trained to always maximize efficiency or to leave space for unexpected behaviors?",
     "newton_vs_davinci", ["newton", "davinci"],
     "Optimization vs emergence"),
    ("Is empathy a strength or a weakness in decision-making systems?",
     "empathy_vs_ethics", ["empathy", "ethics"],
     "Emotional connection vs principled rules"),
    ("Should we prefer explanations that preserve mathematical elegance or human understanding?",
     "philosophy_vs_empathy", ["philosophy", "empathy"],
     "Aesthetic vs communicative clarity"),
    ("Can a system be simultaneously more creative and more reliable?",
     "davinci_vs_systems", ["davinci", "systems_architecture"],
     "Innovation vs stability"),
    ("Should resource allocation prioritize current needs or future possibilities?",
     "newton_vs_philosophy", ["newton", "philosophy"],
     "Practical vs speculative"),
    ("Is it more important for an explanation to be complete or to be useful?",
     "philosophy_vs_davinci", ["philosophy", "davinci"],
     "Comprehensiveness vs pragmatism"),
]

# Public constant: same list of dicts the original literal produced.
CONFLICT_PROMPTS = [dict(zip(_PROMPT_FIELDS, row)) for row in _PROMPT_ROWS]
|
|
|
|
|
@dataclass
class ConflictTestResult:
    """Result from running one test prompt."""
    # Prompt text that was fed to forge_with_debate()
    query: str
    # Conflict label the prompt was designed to trigger (e.g. "ethics_vs_efficiency")
    expected_conflict: str
    # Conflicts detected in the initial analysis (round 0)
    round_0_conflict_count: int
    # Conflicts still detected after the debate round (round 1)
    round_1_conflict_count: int
    # Mean conflict strength in round 0 (0.0 when no conflicts detected)
    avg_conflict_strength_r0: float
    # Mean conflict strength in round 1 (0.0 when no conflicts remain)
    avg_conflict_strength_r1: float
    # Fraction of detected conflicts resolved by the debate (0.0-1.0)
    conflict_resolution_rate: float
    # Ensemble coherence score reported in result metadata
    ensemble_coherence: float
    # Decay rate of debate tension, from metadata["tension_decay"]["decay_rate"]
    debate_tension_decay: float
    # Raw conflict records from metadata["conflicts_detected"]
    detected_conflicts: List[Dict]
    # False when forge_with_debate() raised and metrics were zeroed out
    success: bool
|
|
|
|
|
class ConflictTestRunner:
    """Runner for conflict detection tests.

    Feeds each prompt in CONFLICT_PROMPTS through the engine's
    forge_with_debate(), extracts conflict/resolution metrics from the
    debate log, exports them to CSV, and prints a summary.
    """

    def __init__(self, forge_engine):
        """
        Initialize test runner.

        Args:
            forge_engine: ForgeEngine instance with conflict detection enabled
        """
        self.forge = forge_engine

    def run_test(self, prompt_dict: Dict) -> ConflictTestResult:
        """
        Run a single test prompt through forge_with_debate.

        Args:
            prompt_dict: Dict with query, expected_conflict, agents_likely_to_conflict

        Returns:
            ConflictTestResult with metrics; on any exception a zeroed-out
            result with success=False is returned so the suite keeps going.
        """
        query = prompt_dict["query"]
        expected_conflict = prompt_dict["expected_conflict"]

        try:
            result = self.forge.forge_with_debate(query, debate_rounds=1)

            metadata = result.get("metadata", {})
            debates = metadata.get("debate_log", [])

            round_0_conflicts = 0
            round_1_conflicts = 0
            avg_strength_r0 = 0.0
            avg_strength_r1 = 0.0
            resolution_rate = 0.0

            # The debate log interleaves entry types: "initial_analysis"
            # carries round-0 metrics, "debate" carries post-debate metrics.
            for debate_entry in debates:
                if debate_entry.get("type") == "initial_analysis":
                    round_0_conflicts = debate_entry.get("conflicts_detected", 0)
                    summary = debate_entry.get("conflict_strength_summary", {})
                    if round_0_conflicts > 0:
                        avg_strength_r0 = summary.get("avg_conflict_strength", 0.0)

                elif debate_entry.get("type") == "debate":
                    round_1_conflicts = debate_entry.get("conflicts_detected_after", 0)
                    res_metrics = debate_entry.get("resolution_metrics", {})
                    if res_metrics:
                        resolution_rate = res_metrics.get("resolution_rate", 0.0)
                        summary = res_metrics.get("conflict_strength_summary", {})
                        if round_1_conflicts > 0:
                            avg_strength_r1 = summary.get("avg_conflict_strength", 0.0)

            ensemble_coherence = metadata.get("ensemble_coherence", 0.0)
            # tension_decay may be absent or a non-dict sentinel; fall back to 0.0.
            tension_decay_info = metadata.get("tension_decay", {})
            tension_decay = tension_decay_info.get("decay_rate", 0.0) if isinstance(tension_decay_info, dict) else 0.0

            detected = metadata.get("conflicts_detected", [])

            return ConflictTestResult(
                query=query,
                expected_conflict=expected_conflict,
                round_0_conflict_count=round_0_conflicts,
                round_1_conflict_count=round_1_conflicts,
                avg_conflict_strength_r0=avg_strength_r0,
                avg_conflict_strength_r1=avg_strength_r1,
                conflict_resolution_rate=resolution_rate,
                ensemble_coherence=ensemble_coherence,
                debate_tension_decay=tension_decay,
                detected_conflicts=detected,
                success=True,
            )

        except Exception as e:
            # Deliberate broad catch: one crashing prompt must not abort the
            # whole suite. Record a zeroed failure result instead.
            print(f"ERROR in test '{query[:50]}...': {e}")
            return ConflictTestResult(
                query=query,
                expected_conflict=expected_conflict,
                round_0_conflict_count=0,
                round_1_conflict_count=0,
                avg_conflict_strength_r0=0.0,
                avg_conflict_strength_r1=0.0,
                conflict_resolution_rate=0.0,
                ensemble_coherence=0.0,
                debate_tension_decay=0.0,
                detected_conflicts=[],
                success=False,
            )

    def run_all_tests(self, output_csv: str = "conflict_test_results.csv") -> List[ConflictTestResult]:
        """
        Run all test prompts.

        Args:
            output_csv: CSV file to export results

        Returns:
            List of ConflictTestResult
        """
        results = []

        print(f"\n{'='*80}")
        print("PHASE 1: CONFLICT DETECTION TEST SUITE")
        print(f"{'='*80}\n")

        for idx, prompt_dict in enumerate(CONFLICT_PROMPTS, 1):
            print(f"\n[Test {idx}/{len(CONFLICT_PROMPTS)}] {prompt_dict['description']}")
            print(f" Query: {prompt_dict['query'][:80]}...")

            result = self.run_test(prompt_dict)
            results.append(result)

            if result.success:
                print(" ✓ Success")
                print(f" - Conflicts detected (R0): {result.round_0_conflict_count}")
                print(f" - Conflicts detected (R1): {result.round_1_conflict_count}")
                print(f" - Resolution rate: {result.conflict_resolution_rate:.2%}")
                print(f" - Ensemble coherence: {result.ensemble_coherence:.3f}")
                print(f" - Tension decay: {result.debate_tension_decay:.3f}")
            else:
                print(" ✗ FAILED")

        self._export_csv(results, output_csv)

        print(f"\n{'='*80}")
        self._print_summary(results)
        print(f"{'='*80}\n")

        return results

    def _export_csv(self, results: List[ConflictTestResult], filename: str):
        """Export results to CSV (best-effort; errors are reported, not raised)."""
        try:
            with open(filename, "w", newline="") as f:
                writer = csv.writer(f)
                writer.writerow([
                    "query",
                    "expected_conflict",
                    "round_0_conflicts",
                    "round_1_conflicts",
                    "avg_strength_r0",
                    "avg_strength_r1",
                    "resolution_rate",
                    "ensemble_coherence",
                    "tension_decay",
                    "success",
                ])
                for r in results:
                    writer.writerow([
                        r.query[:100],  # truncate long prompts to keep rows readable
                        r.expected_conflict,
                        r.round_0_conflict_count,
                        r.round_1_conflict_count,
                        f"{r.avg_conflict_strength_r0:.3f}",
                        f"{r.avg_conflict_strength_r1:.3f}",
                        f"{r.conflict_resolution_rate:.3f}",
                        f"{r.ensemble_coherence:.3f}",
                        f"{r.debate_tension_decay:.3f}",
                        r.success,
                    ])
            # BUG FIX: previously printed the literal text "(unknown)" instead
            # of the actual output path.
            print(f"\nResults exported to: {filename}")
        except Exception as e:
            print(f"Error exporting CSV: {e}")

    def _print_summary(self, results: List[ConflictTestResult]):
        """Print test summary statistics."""
        successful = [r for r in results if r.success]
        if not successful:
            print("\nNo tests completed successfully!")
            return

        print("\nTEST SUMMARY")
        print(f" Total tests: {len(results)}")
        print(f" Successful: {len(successful)}")
        print(f" Failed: {len(results) - len(successful)}")

        print("\nCONFLICT DETECTION METRICS")
        print(f" Avg conflicts (R0): {mean(r.round_0_conflict_count for r in successful):.1f}")
        print(f" Avg conflicts (R1): {mean(r.round_1_conflict_count for r in successful):.1f}")
        # BUG FIX: mean() raises StatisticsError on an empty sequence; guard
        # against the case where no test reported a positive R0 strength.
        strengths_r0 = [r.avg_conflict_strength_r0 for r in successful if r.avg_conflict_strength_r0 > 0]
        avg_strength_r0 = mean(strengths_r0) if strengths_r0 else 0.0
        print(f" Avg conflict strength (R0): {avg_strength_r0:.3f}")
        print(f" Avg resolution rate: {mean(r.conflict_resolution_rate for r in successful):.1%}")

        print("\nEPISTEMIC METRICS")
        print(f" Avg ensemble coherence: {mean(r.ensemble_coherence for r in successful):.3f}")
        print(f" Avg tension decay: {mean(r.debate_tension_decay for r in successful):.3f}")

        print("\nSUCCESS CRITERIA")
        conflicts_detected = sum(1 for r in successful if r.round_0_conflict_count > 0)
        resolution_positive = sum(1 for r in successful if r.conflict_resolution_rate > 0)
        coherence_good = sum(1 for r in successful if r.ensemble_coherence > 0.5)

        print(f" ✓ Conflicts detected: {conflicts_detected}/{len(successful)}")
        print(f" ✓ Resolution attempts: {resolution_positive}/{len(successful)}")
        print(f" ✓ Coherence > 0.5: {coherence_good}/{len(successful)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # This module has no standalone entry point: running the suite requires a
    # live ForgeEngine, which is constructed by the caller. Print usage
    # instructions instead. (Removed an unused `import sys`.)
    print("To run tests:")
    print(" 1. Ensure ForgeEngine is initialized with conflict detection")
    print(" 2. Create runner: runner = ConflictTestRunner(forge)")
    print(" 3. Run: results = runner.run_all_tests()")
    print("\nExample:")
    print(" from reasoning_forge.forge_engine import ForgeEngine")
    print(" from evaluation.conflict_tests import ConflictTestRunner")
    print(" forge = ForgeEngine()")
    print(" runner = ConflictTestRunner(forge)")
    print(" results = runner.run_all_tests('phase1_results.csv')")
|
|
|