Spaces:

Bellok
/

warbler-cda

Running on Zero

File size: 11,097 Bytes

ec2d906

#!/usr/bin/env python3
"""
Test dual NPC interactions with warm-up phase for proper multi-agent conversations.

This implements the complete testing protocol to solve:
1. Intro loop problem (dialogue prioritization over biography)
2. Context collapse (warm-up ensures dialogue anchors exist)
3. Mass query diversity (fallback context prevents empty retrieval)

Based on Perplexity.ai diagnostics for proper NPC-to-NPC testing.
"""

import requests
import json
import time
from datetime import datetime

# Base URL of the NPC API server; every request in this script targets a path under it.
API_BASE = "http://localhost:8000"

def warm_up_npc(npc_id, warmup_turns=3):
    """Pre-populate an NPC with dialogue anchors to prevent cold start problems.

    Sends canned prompts to ``POST {API_BASE}/npc/chat`` as the player
    ``"warmup-system"`` and prints the coherence score reported for each
    successful turn.

    Args:
        npc_id: Identifier of the NPC to warm up.
        warmup_turns: Number of warm-up prompts to send. Only three canned
            prompts exist, so larger values are capped; values <= 0 send
            nothing.

    Returns:
        None. HTTP errors and transport failures are reported on stdout and
        do not abort the remaining warm-up turns.
    """
    warmup_prompts = [
        "Who are you and what is your purpose?",
        "Tell me about yourself in more detail.",
        "What makes you unique in your role?"
    ]
    # Report the number of turns that will really be sent, not the requested one.
    prompts = warmup_prompts[:max(warmup_turns, 0)]
    print(f"🔄 Warming up {npc_id} with {len(prompts)} dialogue turns...")

    succeeded = 0
    for turn, prompt in enumerate(prompts, start=1):
        try:
            response = requests.post(f"{API_BASE}/npc/chat", json={
                "npc_id": npc_id,
                "player_id": "warmup-system",
                "message": prompt
            }, timeout=30)
        except requests.RequestException as exc:
            # Treat transport errors like any other failed turn: warn and go on.
            print(f"  ⚠️ Warm-up turn {turn} failed: {exc}")
        else:
            if response.status_code == 200:
                result = response.json()
                # ``or 0.0`` also covers an explicit null score in the payload.
                coherence = result.get('coherence_score') or 0.0
                print(f"  ✓ Warm-up turn {turn}: coherence {coherence:.3f}")
                succeeded += 1
            else:
                print(f"  ⚠️ Warm-up turn {turn} failed: {response.status_code}")

        time.sleep(0.2)  # Brief pause between turns

    if prompts and not succeeded:
        # Do not claim success when no turn produced any dialogue history.
        print(f"⚠️ {npc_id} warm-up failed: no dialogue history was created")
    else:
        print(f"✅ {npc_id} warmed up with dialogue history")

def test_dual_npc_conversation(npc_a_id, npc_b_id, turns=30):
    """Test NPC-to-NPC conversation with proper warm-up.

    Both NPCs are warmed up, then they alternate turns: each NPC receives the
    other's previous reply via ``POST {API_BASE}/npc/chat``. Per-turn coherence
    is printed, self-consumption metrics are fetched every fifth turn, and a
    summary (coherence statistics, intro-loop check, target check) is printed
    at the end. The full transcript is written to
    ``npc_conversation_<a>_<b>_<unix-ts>.json`` in the current directory.

    Args:
        npc_a_id: NPC that speaks first.
        npc_b_id: NPC that answers.
        turns: Maximum number of turns to run. The conversation stops early
            on the first failed request; the partial log is still saved.

    Returns:
        None.
    """
    print(f"\n{'='*70}")
    print(f"🗣️  TESTING DUAL NPC CONVERSATION: {npc_a_id} ↔ {npc_b_id}")
    print(f"{'='*70}")

    # Warm up both NPCs to ensure dialogue anchors exist
    warm_up_npc(npc_a_id, warmup_turns=3)
    warm_up_npc(npc_b_id, warmup_turns=3)

    # Track metrics
    conversation_log = []
    coherence_scores = []
    avg_coherence = 0.0  # stays 0.0 when no turn succeeds

    # Initialize conversation with the first NPC being greeted
    current_speaker = npc_a_id
    other_speaker = npc_b_id
    last_message = "Hello there! I've been thinking about how we can work together to improve our conversations."

    print(f"\nStarting {turns}-turn conversation...")
    print("-" * 50)

    for turn in range(1, turns + 1):
        # Current speaker responds to last message
        try:
            response = requests.post(f"{API_BASE}/npc/chat", json={
                "npc_id": current_speaker,
                "player_id": "npc-system",
                "message": last_message
            }, timeout=30)
        except requests.RequestException as exc:
            # Stop like any other failed turn so the partial log is still saved.
            print(f"❌ Turn {turn} failed: {exc}")
            break

        if response.status_code != 200:
            print(f"❌ Turn {turn} failed: {response.status_code} - {response.text[:100]}...")
            break

        result = response.json()
        npc_response = result['npc_response']
        # Optional fields fall back to defaults, as in the other test helpers.
        coherence = result.get('coherence_score') or 0.0
        coherence_scores.append(coherence)

        # Display turn information
        print(f"Turn {turn:2d}: {current_speaker}")
        print(f"         💬 {npc_response[:80]}...")
        print(f"         📈 Coherence: {coherence:.3f}")

        conversation_log.append({
            "turn": turn,
            "speaker": current_speaker,
            "input_message": last_message,
            "response": npc_response,
            "coherence": coherence,
            "emotion": result.get('emotion', 'unknown'),
            "intent": result.get('intent', 'unknown'),
        })

        # Check for self-consumption metrics every 5 turns
        if turn % 5 == 0:
            # Metrics are best-effort: any problem is reported, never fatal.
            try:
                metrics_response = requests.get(
                    f"{API_BASE}/npc/metrics/self-consumption", timeout=30
                )
                if metrics_response.status_code == 200:
                    metrics = metrics_response.json()
                    anchors = metrics.get('anchors_created', 0)
                    micros = metrics.get('micro_summaries_distilled', 0)
                    macros = metrics.get('macro_distillations_created', 0)
                    print(f"         📊 Metrics: {anchors} anchors, "
                          f"{micros} micro-summaries, {macros} macro-distillations")
            except Exception as e:
                print(f"         📊 Could not retrieve metrics: {e}")

        # Switch speakers
        current_speaker, other_speaker = other_speaker, current_speaker
        last_message = npc_response

    # Analysis
    print(f"\n{'='*70}")
    print("📊 CONVERSATION ANALYSIS")
    print(f"{'='*70}")

    if coherence_scores:
        avg_coherence = sum(coherence_scores) / len(coherence_scores)
        min_coherence = min(coherence_scores)
        max_coherence = max(coherence_scores)

        # Compare last against first score; equal scores are neither trend.
        first_score, last_score = coherence_scores[0], coherence_scores[-1]
        if last_score > first_score:
            trend = "📈 Improving"
        elif last_score < first_score:
            trend = "📉 Degrading"
        else:
            trend = "➡️ Stable"

        print("Coherence Metrics:")
        print(f"         Average: {avg_coherence:.3f}")
        print(f"         Min:     {min_coherence:.3f}")
        print(f"         Max:     {max_coherence:.3f}")
        print(f"         Trend: {trend}")

        # Check for intro loop
        intro_phrases = ('i am', 'my name is', 'hello', 'greetings')
        intro_responses = [
            entry for entry in conversation_log
            if any(phrase in entry['response'].lower() for phrase in intro_phrases)
        ]

        if len(intro_responses) > 4:  # More than 4 intros in 30 turns
            print("❌ INTRO LOOP DETECTED: NPCs repeatedly introducing themselves")
            print(f"         Found {len(intro_responses)} introduction-like responses")
        else:
            print("✅ CONVERSATION FLOW: NPCs moving beyond introductions")

        # Overall assessment
        if avg_coherence >= 0.65:
            print("🌟 SUCCESS: Conversation coherence meets target (≥0.65)")
        else:
            print(f"⚠️  BELOW TARGET: Average coherence {avg_coherence:.3f} < 0.65")

    else:
        print("No coherence scores available for analysis")

    # Save detailed log
    timestamp = int(time.time())
    log_filename = f"npc_conversation_{npc_a_id}_{npc_b_id}_{timestamp}.json"
    with open(log_filename, 'w', encoding='utf-8') as f:
        json.dump({
            "test_metadata": {
                "npc_a": npc_a_id,
                "npc_b": npc_b_id,
                "turns_attempted": turns,
                "turns_completed": len(conversation_log),
                "test_timestamp": timestamp,
                "avg_coherence": avg_coherence
            },
            "conversation_log": conversation_log
        }, f, indent=2)

    print(f"💾 Detailed log saved to: {log_filename}")

def test_mass_query(npc_ids, prompt, warmup_first=True):
    """Test mass query - should get diverse responses, not all identical.

    Sends the same ``prompt`` to every NPC via ``POST {API_BASE}/npc/chat``
    (player ``"mass-query"``), then prints a diversity analysis (share of
    distinct replies, most frequent duplicates) and the average coherence.

    Args:
        npc_ids: Iterable of NPC identifiers to query.
        prompt: Message sent to each NPC.
        warmup_first: When true, each NPC gets a two-turn warm-up first.

    Returns:
        dict mapping each NPC id to a dict with keys ``response`` (reply
        truncated to 120 characters, or ``'FAILED'``), ``coherence``,
        ``emotion`` and ``turn_number``.
    """
    from collections import Counter

    print(f"\n{'='*70}")
    print(f"🎯 TESTING MASS QUERY: '{prompt}'")
    print(f"📡 Testing {len(npc_ids)} NPCs: {', '.join(npc_ids)}")
    print(f"{'='*70}")

    # Optional warm-up to ensure dialogue anchors
    if warmup_first:
        print("🔄 Warming up all NPCs for fair comparison...")
        for npc_id in npc_ids:
            warm_up_npc(npc_id, warmup_turns=2)
        print("✅ All NPCs warmed up")

    print("\n🚀 Executing mass query...")
    responses = {}
    # Untruncated replies of successful queries, keyed by NPC id. Diversity is
    # judged on these so replies that only share a 120-char prefix stay distinct.
    full_texts = {}
    failed_entry = {'response': 'FAILED', 'coherence': 0.0,
                    'emotion': 'unknown', 'turn_number': 0}

    for npc_id in npc_ids:
        try:
            response = requests.post(f"{API_BASE}/npc/chat", json={
                "npc_id": npc_id,
                "player_id": "mass-query",
                "message": prompt
            }, timeout=30)
        except requests.RequestException as exc:
            # A transport error fails this NPC only; keep querying the rest.
            print(f"   ⚠️  {npc_id}: Failed ({exc})")
            responses[npc_id] = dict(failed_entry)
            full_texts.pop(npc_id, None)
            continue

        if response.status_code == 200:
            result = response.json()
            full_text = result['npc_response']
            npc_response = full_text[:120]  # Truncate for display
            responses[npc_id] = {
                'response': npc_response,
                'coherence': result.get('coherence_score', 0.0),
                'emotion': result.get('emotion', 'unknown'),
                'turn_number': result.get('turn_number', 0)
            }
            full_texts[npc_id] = full_text
            print(f"   {npc_id}: {npc_response}...")
        else:
            print(f"   ⚠️  {npc_id}: Failed ({response.status_code})")
            responses[npc_id] = dict(failed_entry)
            full_texts.pop(npc_id, None)

    # Analysis
    successful_responses = list(full_texts.values())
    unique_responses = len(set(successful_responses))
    total_responses = len(successful_responses)

    print("\n📊 MASS QUERY ANALYSIS")
    print(f"{'='*70}")

    print("Response Diversity:")
    print(f"         Total responses: {total_responses}")
    print(f"         Unique responses: {unique_responses}")

    if total_responses == 0:
        # Nothing came back, so neither diversity verdict would be meaningful.
        print("⚠️  NO RESPONSES: No successful responses to analyze")
    elif unique_responses < total_responses * 0.7:  # Less than 70% unique
        print("❌ CONTEXT COLLAPSE: Low response diversity detected")
        print("         NPCs giving similar/generic responses")
        print("         This indicates empty or identical context retrieval")

        # Show duplicate analysis, most frequent first
        response_counts = Counter(successful_responses)
        duplicates = [(text, count)
                      for text, count in response_counts.most_common()
                      if count > 1]
        if duplicates:
            print("         Duplicate responses found:")
            for text, count in duplicates[:3]:  # Show top 3 duplicates
                print(f"            {count}x: '{text[:60]}...'")
    else:
        print("✅ GOOD DIVERSITY: NPCs giving unique, contextual responses")

    # Coherence analysis
    coherences = [responses[npc_id]['coherence'] for npc_id in full_texts]
    if coherences:
        avg_coherence = sum(coherences) / len(coherences)
        print(f"\nAverage coherence: {avg_coherence:.3f}")

        if avg_coherence < 0.6:
            print("❌ LOW COHERENCE: Responses lack context consistency")
        elif avg_coherence >= 0.7:
            print("✅ HIGH COHERENCE: Responses are contextually coherent")
        else:
            print("⚠️  MODERATE COHERENCE: Mixed quality responses")

    return responses

def run_complete_testing_suite():
    """Execute the three suite stages in order, then print the expected outcomes.

    Stage 1 runs a 30-turn conversation between ``alice-clean`` and
    ``bob-skeptic``. Stage 2 sends a warmed-up mass query to four NPCs.
    Stage 3 sends a second mass query, without warm-up, to the first three
    of those NPCs.
    """
    heavy_rule = "=" * 80

    print("🧪 STARTING WARBLER CDA MULTI-AGENT TESTING SUITE")
    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"🏷️  Timestamp: {started_at}")
    print(heavy_rule)

    # Stage 1: the dual-NPC dialogue (the core problem).
    print("📋 TEST 1: DUAL NPC CONVERSATION")
    test_dual_npc_conversation("alice-clean", "bob-skeptic", turns=30)

    # Stage 2: same prompt to every NPC, checking reply diversity.
    print("\n📋 TEST 2: MASS QUERY DIVERSITY")
    test_npcs = ["alice-clean", "bob-skeptic", "gandalf-wizard", "elara-guardian"]
    test_mass_query(
        test_npcs,
        "What is your greatest achievement in life?",
        warmup_first=True,
    )

    # Stage 3: a different prompt on a subset, to catch generic replies.
    print("\n📋 TEST 3: ALT MASS QUERY (Different Prompt)")
    first_three = test_npcs[:3]
    test_mass_query(
        first_three,
        "How do you approach difficult conversations?",
        warmup_first=False,
    )

    # Closing summary of what a healthy run should have shown.
    print(f"\n{heavy_rule}")
    for summary_line in (
        "🎯 TESTING COMPLETE - EXPECTED RESULTS:",
        "✅ Dual-conversation coherence ≥0.65 (no intro loops)",
        "✅ Mass queries: High response diversity (no context collapse)",
        "✅ Coherence scoring improved from ~0.69 to ~0.79",
    ):
        print(summary_line)
    print(heavy_rule)

if __name__ == "__main__":
    # Script entry point: run the suite, exiting quietly on Ctrl-C and
    # printing a full traceback for any other failure.
    import traceback

    try:
        run_complete_testing_suite()
    except KeyboardInterrupt:
        print("\n🛑 Test interrupted by user")
    except Exception as exc:
        print(f"\n❌ Test suite failed: {exc}")
        traceback.print_exc()