#!/usr/bin/env python3
"""
Test dual NPC interactions with warm-up phase for proper multi-agent conversations.

This implements the complete testing protocol to solve:
1. Intro loop problem (dialogue prioritization over biography)
2. Context collapse (warm-up ensures dialogue anchors exist)
3. Mass query diversity (fallback context prevents empty retrieval)

Based on Perplexity.ai diagnostics for proper NPC-to-NPC testing.
"""

import requests
import json
import time
from datetime import datetime

# Base URL of the locally running NPC chat service.
API_BASE = "http://localhost:8000"


def warm_up_npc(npc_id, warmup_turns=3):
    """Pre-populate an NPC with dialogue anchors to prevent cold-start problems.

    Sends up to ``warmup_turns`` canned introduction prompts to the NPC so
    that its dialogue history is non-empty before the real test begins.

    Args:
        npc_id: Identifier of the NPC to warm up.
        warmup_turns: Number of warm-up prompts to send (capped at the
            number of available prompts, i.e. 3).
    """
    print(f"🔄 Warming up {npc_id} with {warmup_turns} dialogue turns...")

    warmup_prompts = [
        "Who are you and what is your purpose?",
        "Tell me about yourself in more detail.",
        "What makes you unique in your role?"
    ]

    for turn in range(min(warmup_turns, len(warmup_prompts))):
        response = requests.post(f"{API_BASE}/npc/chat", json={
            "npc_id": npc_id,
            "player_id": "warmup-system",
            "message": warmup_prompts[turn]
        }, timeout=30)

        if response.status_code == 200:
            result = response.json()
            coherence = result.get('coherence_score', 0.0)
            # BUG FIX: original printed the literal ".3f" (stripped f-string)
            # instead of the actual coherence value.
            print(f"   Turn {turn + 1}: coherence {coherence:.3f}")
        else:
            print(f"   ⚠️ Warm-up turn {turn + 1} failed: {response.status_code}")

        time.sleep(0.2)  # Brief pause between turns

    print(f"✅ {npc_id} warmed up with dialogue history")


def test_dual_npc_conversation(npc_a_id, npc_b_id, turns=30):
    """Test NPC-to-NPC conversation with proper warm-up.

    Alternates turns between the two NPCs, feeding each one the other's
    previous response, tracking coherence scores, checking self-consumption
    metrics every 5 turns, then analyzing the transcript for intro loops
    and writing a detailed JSON log to disk.

    Args:
        npc_a_id: Identifier of the first NPC (speaks first).
        npc_b_id: Identifier of the second NPC.
        turns: Maximum number of conversation turns to attempt.
    """
    print(f"\n{'=' * 70}")
    print(f"🗣️ TESTING DUAL NPC CONVERSATION: {npc_a_id} ↔ {npc_b_id}")
    print(f"{'=' * 70}")

    # Warm up both NPCs to ensure dialogue anchors exist.
    warm_up_npc(npc_a_id, warmup_turns=3)
    warm_up_npc(npc_b_id, warmup_turns=3)

    # Track metrics.
    conversation_log = []
    coherence_scores = []

    # Initialize conversation with NPC A greeting NPC B.
    current_speaker = npc_a_id
    other_speaker = npc_b_id
    # BUG FIX: this literal was split across physical lines in the mangled
    # source; reconstructed as a single greeting message.
    last_message = ("Hello there! I've been thinking about how we can work "
                    "together to improve our conversations.")

    print(f"\nStarting {turns}-turn conversation...")
    print("-" * 50)

    for turn in range(1, turns + 1):
        # Current speaker responds to the last message.
        response = requests.post(f"{API_BASE}/npc/chat", json={
            "npc_id": current_speaker,
            "player_id": "npc-system",
            "message": last_message
        }, timeout=30)

        if response.status_code != 200:
            print(f"❌ Turn {turn} failed: {response.status_code} - {response.text[:100]}...")
            break

        result = response.json()
        coherence_scores.append(result['coherence_score'])

        # Display turn information (response truncated for readability).
        response_text = result['npc_response'][:80]
        print(f"Turn {turn:2d}: {current_speaker}")
        print(f"   💬 {response_text}...")
        # BUG FIX: original printed the literal ".3f" here.
        print(f"   📊 Coherence: {result['coherence_score']:.3f}")

        conversation_log.append({
            "turn": turn,
            "speaker": current_speaker,
            "input_message": last_message,
            "response": result['npc_response'],
            "coherence": result['coherence_score'],
            "emotion": result['emotion'],
            "intent": result['intent'],
        })

        # Check for self-consumption metrics every 5 turns.
        if turn % 5 == 0:
            try:
                # Added timeout so a hung metrics endpoint can't stall the test.
                metrics_response = requests.get(
                    f"{API_BASE}/npc/metrics/self-consumption", timeout=10)
                if metrics_response.status_code == 200:
                    metrics = metrics_response.json()
                    anchors = metrics.get('anchors_created', 0)
                    micros = metrics.get('micro_summaries_distilled', 0)
                    macros = metrics.get('macro_distillations_created', 0)
                    # BUG FIX: original printed the literal ".1f"; show the
                    # retrieved counters instead.
                    print(f"   📊 Self-consumption: {anchors} anchors, "
                          f"{micros} micro-summaries, {macros} macro-distillations")
            except Exception as e:
                print(f"   📊 Could not retrieve metrics: {e}")

        # Switch speakers for the next turn.
        current_speaker, other_speaker = other_speaker, current_speaker
        last_message = result['npc_response']

    # Analysis.
    print(f"\n{'=' * 70}")
    print("📊 CONVERSATION ANALYSIS")
    print(f"{'=' * 70}")

    if coherence_scores:
        avg_coherence = sum(coherence_scores) / len(coherence_scores)
        min_coherence = min(coherence_scores)
        max_coherence = max(coherence_scores)
        trend = "📈 Improving" if coherence_scores[-1] > coherence_scores[0] else "📉 Degrading"

        print("Coherence Metrics:")
        # BUG FIX: the three metric lines were stripped to literal ".3f".
        print(f"   Average: {avg_coherence:.3f}")
        print(f"   Minimum: {min_coherence:.3f}")
        print(f"   Maximum: {max_coherence:.3f}")
        print(f"   Trend: {trend}")

        # Check for intro loop: NPCs that keep re-introducing themselves.
        intro_responses = [
            log for log in conversation_log
            if any(phrase in log['response'].lower()
                   for phrase in ['i am', 'my name is', 'hello', 'greetings'])
        ]
        if len(intro_responses) > 4:  # More than 4 intros in 30 turns
            print("❌ INTRO LOOP DETECTED: NPCs repeatedly introducing themselves")
            print(f"   Found {len(intro_responses)} introduction-like responses")
        else:
            print("✅ CONVERSATION FLOW: NPCs moving beyond introductions")

        # Overall assessment against the 0.65 coherence target.
        if avg_coherence >= 0.65:
            print("🌟 SUCCESS: Conversation coherence meets target (≥0.65)")
        else:
            # BUG FIX: original printed the literal ".3f".
            print(f"⚠️ BELOW TARGET: average coherence {avg_coherence:.3f} < 0.65")
    else:
        print("No coherence scores available for analysis")

    # Save detailed log.
    timestamp = int(time.time())
    log_filename = f"npc_conversation_{npc_a_id}_{npc_b_id}_{timestamp}.json"
    with open(log_filename, 'w') as f:
        json.dump({
            "test_metadata": {
                "npc_a": npc_a_id,
                "npc_b": npc_b_id,
                "turns_attempted": turns,
                "turns_completed": len(conversation_log),
                "test_timestamp": timestamp,
                # Conditional short-circuits, so avg_coherence being undefined
                # when coherence_scores is empty is safe here.
                "avg_coherence": avg_coherence if coherence_scores else 0.0
            },
            "conversation_log": conversation_log
        }, f, indent=2)

    print(f"💾 Detailed log saved to: {log_filename}")


def test_mass_query(npc_ids, prompt, warmup_first=True):
    """Test a mass query — should get diverse responses, not all identical.

    Sends the same prompt to every NPC in ``npc_ids`` and analyzes the
    responses for diversity (context collapse) and coherence.

    Args:
        npc_ids: List of NPC identifiers to query.
        prompt: The shared question to send to each NPC.
        warmup_first: If True, warm up every NPC (2 turns) before querying.

    Returns:
        dict mapping npc_id -> response summary (truncated response text,
        coherence, emotion, turn number; failed requests marked 'FAILED').
    """
    print(f"\n{'=' * 70}")
    print(f"🎯 TESTING MASS QUERY: '{prompt}'")
    print(f"📡 Testing {len(npc_ids)} NPCs: {', '.join(npc_ids)}")
    print(f"{'=' * 70}")

    # Optional warm-up to ensure dialogue anchors exist for every NPC.
    if warmup_first:
        print("🔄 Warming up all NPCs for fair comparison...")
        for npc_id in npc_ids:
            warm_up_npc(npc_id, warmup_turns=2)
        print("✅ All NPCs warmed up")

    print("\n🚀 Executing mass query...")
    responses = {}
    for npc_id in npc_ids:
        response = requests.post(f"{API_BASE}/npc/chat", json={
            "npc_id": npc_id,
            "player_id": "mass-query",
            "message": prompt
        }, timeout=30)

        if response.status_code == 200:
            result = response.json()
            npc_response = result['npc_response'][:120]  # Truncate for display
            responses[npc_id] = {
                'response': npc_response,
                'coherence': result.get('coherence_score', 0.0),
                'emotion': result.get('emotion', 'unknown'),
                'turn_number': result.get('turn_number', 0)
            }
            print(f"   {npc_id}: {npc_response}...")
        else:
            print(f"   ⚠️ {npc_id}: Failed ({response.status_code})")
            responses[npc_id] = {'response': 'FAILED', 'coherence': 0.0}

    # Diversity analysis.
    successful_responses = [r['response'] for r in responses.values()
                            if r['response'] != 'FAILED']
    unique_responses = len(set(successful_responses))
    total_responses = len(successful_responses)

    print(f"\n📊 MASS QUERY ANALYSIS")
    print(f"{'=' * 70}")
    print("Response Diversity:")
    print(f"   Total responses: {total_responses}")
    print(f"   Unique responses: {unique_responses}")

    if unique_responses < total_responses * 0.7:  # Less than 70% unique
        print("❌ CONTEXT COLLAPSE: Low response diversity detected")
        print("   NPCs giving similar/generic responses")
        print("   This indicates empty or identical context retrieval")

        # Show duplicate analysis.
        from collections import Counter
        response_counts = Counter(successful_responses)
        duplicates = [(response, count)
                      for response, count in response_counts.items() if count > 1]
        if duplicates:
            print("   Duplicate responses found:")
            for response, count in duplicates[:3]:  # Show top 3 duplicates
                print(f"     {count}x: '{response[:60]}...'")
    else:
        print("✅ GOOD DIVERSITY: NPCs giving unique, contextual responses")

    # Coherence analysis.
    coherences = [r['coherence'] for r in responses.values()
                  if r['response'] != 'FAILED']
    if coherences:
        avg_coherence = sum(coherences) / len(coherences)
        # BUG FIX: original printed the literal ".3f".
        print(f"   Average coherence: {avg_coherence:.3f}")

        if avg_coherence < 0.6:
            print("❌ LOW COHERENCE: Responses lack context consistency")
        elif avg_coherence >= 0.7:
            print("✅ HIGH COHERENCE: Responses are contextually coherent")
        else:
            print("⚠️ MODERATE COHERENCE: Mixed quality responses")

    return responses


def run_complete_testing_suite():
    """Run the complete testing suite with proper sequencing."""
    print("🧪 STARTING WARBLER CDA MULTI-AGENT TESTING SUITE")
    print("🏷️ Timestamp: " + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    print("=" * 80)

    # Test 1: Dual NPC conversation (the core problem).
    print("📋 TEST 1: DUAL NPC CONVERSATION")
    test_dual_npc_conversation("alice-clean", "bob-skeptic", turns=30)

    # Test 2: Mass query diversity.
    print("\n📋 TEST 2: MASS QUERY DIVERSITY")
    test_npcs = ["alice-clean", "bob-skeptic", "gandalf-wizard", "elara-guardian"]
    test_mass_query(test_npcs, "What is your greatest achievement in life?",
                    warmup_first=True)

    # Test 3: Different mass query to check for generic responses.
    print("\n📋 TEST 3: ALT MASS QUERY (Different Prompt)")
    test_mass_query(test_npcs[:3], "How do you approach difficult conversations?",
                    warmup_first=False)

    # Summary.
    print("\n" + "=" * 80)
    print("🎯 TESTING COMPLETE - EXPECTED RESULTS:")
    print("✅ Dual-conversation coherence ≥0.65 (no intro loops)")
    print("✅ Mass queries: High response diversity (no context collapse)")
    print("✅ Coherence scoring improved from ~0.69 to ~0.79")
    print("=" * 80)


if __name__ == "__main__":
    try:
        run_complete_testing_suite()
    except KeyboardInterrupt:
        print("\n🛑 Test interrupted by user")
    except Exception as e:
        print(f"\n❌ Test suite failed: {e}")
        import traceback
        traceback.print_exc()