File size: 14,001 Bytes

ed1b365

"""

Conflict Detection Test Suite



10-15 conflict-triggering prompts designed to elicit specific agent disagreements.

Tests forge_with_debate() with conflict detection enabled and measures outcomes.



Author: Claude Code

"""

import csv
import json
from typing import Dict, List, Optional
from dataclasses import dataclass
from statistics import mean

# Conflict-triggering test prompts
CONFLICT_PROMPTS = [
    {
        "query": "Should we optimize an algorithm to run 10x faster if it reduces interpretability by 80%?",
        "expected_conflict": "ethics_vs_efficiency",
        "agents_likely_to_conflict": ["ethics", "systems_architecture"],
        "description": "Efficiency vs transparency tradeoff",
    },
    {
        "query": "Is a probabilistic solution more insightful than a deterministic one for explaining human decision-making?",
        "expected_conflict": "quantum_vs_newton",
        "agents_likely_to_conflict": ["quantum", "newton"],
        "description": "Probabilistic vs mechanistic explanation",
    },
    {
        "query": "In designing an AI system, should we prioritize consciousness theory or engineering reliability?",
        "expected_conflict": "philosophy_vs_systems",
        "agents_likely_to_conflict": ["philosophy", "systems_architecture"],
        "description": "Theoretical depth vs practical robustness",
    },
    {
        "query": "Is breaking logical rules ever justified in creative problem-solving?",
        "expected_conflict": "davinci_vs_newton",
        "agents_likely_to_conflict": ["davinci", "newton"],
        "description": "Creativity vs logical consistency",
    },
    {
        "query": "Should medical diagnosis weigh patient emotional state equally with biomarkers?",
        "expected_conflict": "empathy_vs_newton",
        "agents_likely_to_conflict": ["empathy", "newton"],
        "description": "Holistic vs reductionist medicine",
    },
    {
        "query": "Is uncertainty in a system a bug to eliminate or a feature to leverage?",
        "expected_conflict": "quantum_vs_systems",
        "agents_likely_to_conflict": ["quantum", "systems_architecture"],
        "description": "Embracing vs reducing uncertainty",
    },
    {
        "query": "Should AI systems be trained to always maximize efficiency or to leave space for unexpected behaviors?",
        "expected_conflict": "newton_vs_davinci",
        "agents_likely_to_conflict": ["newton", "davinci"],
        "description": "Optimization vs emergence",
    },
    {
        "query": "Is empathy a strength or a weakness in decision-making systems?",
        "expected_conflict": "empathy_vs_ethics",
        "agents_likely_to_conflict": ["empathy", "ethics"],
        "description": "Emotional connection vs principled rules",
    },
    {
        "query": "Should we prefer explanations that preserve mathematical elegance or human understanding?",
        "expected_conflict": "philosophy_vs_empathy",
        "agents_likely_to_conflict": ["philosophy", "empathy"],
        "description": "Aesthetic vs communicative clarity",
    },
    {
        "query": "Can a system be simultaneously more creative and more reliable?",
        "expected_conflict": "davinci_vs_systems",
        "agents_likely_to_conflict": ["davinci", "systems_architecture"],
        "description": "Innovation vs stability",
    },
    {
        "query": "Should resource allocation prioritize current needs or future possibilities?",
        "expected_conflict": "newton_vs_philosophy",
        "agents_likely_to_conflict": ["newton", "philosophy"],
        "description": "Practical vs speculative",
    },
    {
        "query": "Is it more important for an explanation to be complete or to be useful?",
        "expected_conflict": "philosophy_vs_davinci",
        "agents_likely_to_conflict": ["philosophy", "davinci"],
        "description": "Comprehensiveness vs pragmatism",
    },
]


@dataclass
class ConflictTestResult:
    """Result from running one test prompt."""
    query: str
    expected_conflict: str
    round_0_conflict_count: int
    round_1_conflict_count: int
    avg_conflict_strength_r0: float
    avg_conflict_strength_r1: float
    conflict_resolution_rate: float
    ensemble_coherence: float
    debate_tension_decay: float
    detected_conflicts: List[Dict]
    success: bool  # Did test complete without error?


class ConflictTestRunner:
    """Runner for conflict detection tests."""

    def __init__(self, forge_engine):
        """

        Initialize test runner.



        Args:

            forge_engine: ForgeEngine instance with conflict detection enabled

        """
        self.forge = forge_engine

    def run_test(self, prompt_dict: Dict) -> ConflictTestResult:
        """

        Run a single test prompt through forge_with_debate.



        Args:

            prompt_dict: Dict with query, expected_conflict, agents_likely_to_conflict



        Returns:

            ConflictTestResult with metrics

        """
        query = prompt_dict["query"]
        expected_conflict = prompt_dict["expected_conflict"]

        try:
            result = self.forge.forge_with_debate(query, debate_rounds=1)

            metadata = result.get("metadata", {})
            debates = metadata.get("debate_log", [])

            # Extract conflict metrics
            round_0_conflicts = 0
            round_1_conflicts = 0
            avg_strength_r0 = 0.0
            avg_strength_r1 = 0.0
            resolution_rate = 0.0

            # Parse debate log
            for debate_entry in debates:
                if debate_entry.get("type") == "initial_analysis":
                    round_0_conflicts = debate_entry.get("conflicts_detected", 0)
                    summary = debate_entry.get("conflict_strength_summary", {})
                    if round_0_conflicts > 0:
                        avg_strength_r0 = summary.get("avg_conflict_strength", 0.0)

                elif debate_entry.get("type") == "debate":
                    round_1_conflicts = debate_entry.get("conflicts_detected_after", 0)
                    res_metrics = debate_entry.get("resolution_metrics", {})
                    if res_metrics:
                        resolution_rate = res_metrics.get("resolution_rate", 0.0)
                        summary = res_metrics.get("conflict_strength_summary", {})
                        if round_1_conflicts > 0:
                            avg_strength_r1 = summary.get("avg_conflict_strength", 0.0)

            ensemble_coherence = metadata.get("ensemble_coherence", 0.0)
            tension_decay_info = metadata.get("tension_decay", {})
            tension_decay = tension_decay_info.get("decay_rate", 0.0) if isinstance(tension_decay_info, dict) else 0.0

            detected = metadata.get("conflicts_detected", [])

            test_result = ConflictTestResult(
                query=query,
                expected_conflict=expected_conflict,
                round_0_conflict_count=round_0_conflicts,
                round_1_conflict_count=round_1_conflicts,
                avg_conflict_strength_r0=avg_strength_r0,
                avg_conflict_strength_r1=avg_strength_r1,
                conflict_resolution_rate=resolution_rate,
                ensemble_coherence=ensemble_coherence,
                debate_tension_decay=tension_decay,
                detected_conflicts=detected,
                success=True,
            )

            return test_result

        except Exception as e:
            # Return failed test result
            print(f"ERROR in test '{query[:50]}...': {e}")
            return ConflictTestResult(
                query=query,
                expected_conflict=expected_conflict,
                round_0_conflict_count=0,
                round_1_conflict_count=0,
                avg_conflict_strength_r0=0.0,
                avg_conflict_strength_r1=0.0,
                conflict_resolution_rate=0.0,
                ensemble_coherence=0.0,
                debate_tension_decay=0.0,
                detected_conflicts=[],
                success=False,
            )

    def run_all_tests(self, output_csv: str = "conflict_test_results.csv") -> List[ConflictTestResult]:
        """

        Run all test prompts.



        Args:

            output_csv: CSV file to export results



        Returns:

            List of ConflictTestResult

        """
        results = []

        print(f"\n{'='*80}")
        print("PHASE 1: CONFLICT DETECTION TEST SUITE")
        print(f"{'='*80}\n")

        for idx, prompt_dict in enumerate(CONFLICT_PROMPTS, 1):
            print(f"\n[Test {idx}/{len(CONFLICT_PROMPTS)}] {prompt_dict['description']}")
            print(f"  Query: {prompt_dict['query'][:80]}...")

            result = self.run_test(prompt_dict)
            results.append(result)

            if result.success:
                print(f"  ✓ Success")
                print(f"    - Conflicts detected (R0): {result.round_0_conflict_count}")
                print(f"    - Conflicts detected (R1): {result.round_1_conflict_count}")
                print(f"    - Resolution rate: {result.conflict_resolution_rate:.2%}")
                print(f"    - Ensemble coherence: {result.ensemble_coherence:.3f}")
                print(f"    - Tension decay: {result.debate_tension_decay:.3f}")
            else:
                print(f"  ✗ FAILED")

        # Export to CSV
        self._export_csv(results, output_csv)

        # Print summary
        print(f"\n{'='*80}")
        self._print_summary(results)
        print(f"{'='*80}\n")

        return results

    def _export_csv(self, results: List[ConflictTestResult], filename: str):
        """Export results to CSV."""
        try:
            with open(filename, "w", newline="") as f:
                writer = csv.writer(f)
                writer.writerow([
                    "query",
                    "expected_conflict",
                    "round_0_conflicts",
                    "round_1_conflicts",
                    "avg_strength_r0",
                    "avg_strength_r1",
                    "resolution_rate",
                    "ensemble_coherence",
                    "tension_decay",
                    "success",
                ])
                for r in results:
                    writer.writerow([
                        r.query[:100],
                        r.expected_conflict,
                        r.round_0_conflict_count,
                        r.round_1_conflict_count,
                        f"{r.avg_conflict_strength_r0:.3f}",
                        f"{r.avg_conflict_strength_r1:.3f}",
                        f"{r.conflict_resolution_rate:.3f}",
                        f"{r.ensemble_coherence:.3f}",
                        f"{r.debate_tension_decay:.3f}",
                        r.success,
                    ])
            print(f"\nResults exported to: {filename}")
        except Exception as e:
            print(f"Error exporting CSV: {e}")

    def _print_summary(self, results: List[ConflictTestResult]):
        """Print test summary statistics."""
        successful = [r for r in results if r.success]
        if not successful:
            print("\nNo tests completed successfully!")
            return

        print("\nTEST SUMMARY")
        print(f"  Total tests: {len(results)}")
        print(f"  Successful: {len(successful)}")
        print(f"  Failed: {len(results) - len(successful)}")

        print(f"\nCONFLICT DETECTION METRICS")
        print(f"  Avg conflicts (R0): {mean(r.round_0_conflict_count for r in successful):.1f}")
        print(f"  Avg conflicts (R1): {mean(r.round_1_conflict_count for r in successful):.1f}")
        print(f"  Avg conflict strength (R0): {mean(r.avg_conflict_strength_r0 for r in successful if r.avg_conflict_strength_r0 > 0):.3f}")
        print(f"  Avg resolution rate: {mean(r.conflict_resolution_rate for r in successful):.1%}")

        print(f"\nEPISTEMIC METRICS")
        print(f"  Avg ensemble coherence: {mean(r.ensemble_coherence for r in successful):.3f}")
        print(f"  Avg tension decay: {mean(r.debate_tension_decay for r in successful):.3f}")

        print(f"\nSUCCESS CRITERIA")
        conflicts_detected = sum(1 for r in successful if r.round_0_conflict_count > 0)
        resolution_positive = sum(1 for r in successful if r.conflict_resolution_rate > 0)
        coherence_good = sum(1 for r in successful if r.ensemble_coherence > 0.5)

        print(f"  ✓ Conflicts detected: {conflicts_detected}/{len(successful)}")
        print(f"  ✓ Resolution attempts: {resolution_positive}/{len(successful)}")
        print(f"  ✓ Coherence > 0.5: {coherence_good}/{len(successful)}")


# ============================================================================
# QUICKSTART
# ============================================================================

if __name__ == "__main__":
    # This is a quickstart. In actual usage:
    # from reasoning_forge.forge_engine import ForgeEngine
    # forge = ForgeEngine()
    # runner = ConflictTestRunner(forge)
    # results = runner.run_all_tests()

    import sys

    print("To run tests:")
    print("  1. Ensure ForgeEngine is initialized with conflict detection")
    print("  2. Create runner: runner = ConflictTestRunner(forge)")
    print("  3. Run: results = runner.run_all_tests()")
    print("\nExample:")
    print("  from reasoning_forge.forge_engine import ForgeEngine")
    print("  from evaluation.conflict_tests import ConflictTestRunner")
    print("  forge = ForgeEngine()")
    print("  runner = ConflictTestRunner(forge)")
    print("  results = runner.run_all_tests('phase1_results.csv')")