Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Test dual NPC interactions with warm-up phase for proper multi-agent conversations. | |
| This implements the complete testing protocol to solve: | |
| 1. Intro loop problem (dialogue prioritization over biography) | |
| 2. Context collapse (warm-up ensures dialogue anchors exist) | |
| 3. Mass query diversity (fallback context prevents empty retrieval) | |
| Based on Perplexity.ai diagnostics for proper NPC-to-NPC testing. | |
| """ | |
| import requests | |
| import json | |
| import time | |
| from datetime import datetime | |
| API_BASE = "http://localhost:8000" | |
def warm_up_npc(npc_id, warmup_turns=3):
    """Pre-populate an NPC with dialogue anchors to prevent cold start problems.

    Sends scripted prompts to ``{API_BASE}/npc/chat`` as player
    ``warmup-system`` and prints the coherence score reported for every
    successful turn. A non-200 reply is reported and the warm-up continues
    with the next prompt; exceptions raised by ``requests`` propagate.

    Args:
        npc_id: Identifier sent as ``npc_id`` in each chat request.
        warmup_turns: Number of prompts to send. Capped at the number of
            built-in prompts (three); values <= 0 send nothing.

    Returns:
        None. Progress is reported on stdout only.
    """
    print(f"π Warming up {npc_id} with {warmup_turns} dialogue turns...")
    warmup_prompts = [
        "Who are you and what is your purpose?",
        "Tell me about yourself in more detail.",
        "What makes you unique in your role?",
    ]
    # max(..., 0) keeps a negative count from turning into a "drop the last
    # N prompts" slice.
    for turn, prompt in enumerate(warmup_prompts[:max(warmup_turns, 0)], start=1):
        response = requests.post(f"{API_BASE}/npc/chat", json={
            "npc_id": npc_id,
            "player_id": "warmup-system",
            "message": prompt,
        }, timeout=30)
        if response.status_code == 200:
            result = response.json()
            coherence = result.get('coherence_score', 0.0)
            # The original printed the bare format spec ".3f" here; report
            # the score that was just read instead.
            print(f" Turn {turn}: coherence {coherence:.3f}")
        else:
            print(f" β οΈ Warm-up turn {turn} failed: {response.status_code}")
        time.sleep(0.2)  # Brief pause between turns
    print(f"β {npc_id} warmed up with dialogue history")
def test_dual_npc_conversation(npc_a_id, npc_b_id, turns=30):
    """Test NPC-to-NPC conversation with proper warm-up.

    Both NPCs are warmed up, then they alternate turns: each NPC receives the
    other's previous reply through ``{API_BASE}/npc/chat`` (player id
    ``npc-system``), starting with a fixed greeting delivered to
    ``npc_a_id``. The loop stops early on the first non-200 reply. Afterwards
    coherence statistics and an intro-loop heuristic are printed, and the
    full transcript is written to
    ``npc_conversation_<a>_<b>_<unix-timestamp>.json`` in the current
    directory.

    Args:
        npc_a_id: NPC that answers the opening message.
        npc_b_id: NPC that answers second.
        turns: Maximum number of chat turns to attempt.

    Returns:
        None. Results are printed and saved to the JSON log file.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print(f"π£οΈ TESTING DUAL NPC CONVERSATION: {npc_a_id} β {npc_b_id}")
    print(banner)

    # Warm up both NPCs to ensure dialogue anchors exist
    warm_up_npc(npc_a_id, warmup_turns=3)
    warm_up_npc(npc_b_id, warmup_turns=3)

    # Track metrics
    conversation_log = []
    coherence_scores = []
    avg_coherence = 0.0  # stays 0.0 in the saved log when no turn succeeds

    # Initialize conversation with the first NPC answering a fixed greeting
    current_speaker = npc_a_id
    other_speaker = npc_b_id
    last_message = "Hello there! I've been thinking about how we can work together to improve our conversations."

    print(f"\nStarting {turns}-turn conversation...")
    print("-" * 50)

    for turn in range(1, turns + 1):
        # Current speaker responds to last message
        response = requests.post(f"{API_BASE}/npc/chat", json={
            "npc_id": current_speaker,
            "player_id": "npc-system",
            "message": last_message,
        }, timeout=30)
        if response.status_code != 200:
            print(f"β Turn {turn} failed: {response.status_code} - {response.text[:100]}...")
            break

        result = response.json()
        npc_response = result['npc_response']
        # Optional fields are read with defaults, matching the other helpers
        # in this file, so one missing key does not abort the whole run
        # before the log is saved.
        coherence = result.get('coherence_score', 0.0)
        emotion = result.get('emotion', 'unknown')
        intent = result.get('intent', 'unknown')
        coherence_scores.append(coherence)

        # Display turn information
        response_text = npc_response[:80]
        print(f"Turn {turn:2d}: {current_speaker}")
        print(f" π¬ {response_text}...")
        print(f" coherence={coherence:.3f} emotion={emotion} intent={intent}")

        conversation_log.append({
            "turn": turn,
            "speaker": current_speaker,
            "input_message": last_message,
            "response": npc_response,
            "coherence": coherence,
            "emotion": emotion,
            "intent": intent,
        })

        # Check for self-consumption metrics every 5 turns
        if turn % 5 == 0:
            try:
                metrics_response = requests.get(
                    f"{API_BASE}/npc/metrics/self-consumption",
                    timeout=30,  # same bound as the chat requests
                )
                if metrics_response.status_code == 200:
                    metrics = metrics_response.json()
                    anchors = metrics.get('anchors_created', 0)
                    micros = metrics.get('micro_summaries_distilled', 0)
                    macros = metrics.get('macro_distillations_created', 0)
                    print(
                        f" Self-consumption: anchors={anchors}, "
                        f"micro-summaries={micros}, macro-distillations={macros}"
                    )
            except Exception as e:
                # Metrics are best-effort diagnostics; never abort the
                # conversation because they are unavailable.
                print(f" π Could not retrieve metrics: {e}")

        # Switch speakers
        current_speaker, other_speaker = other_speaker, current_speaker
        last_message = npc_response

    # Analysis
    print(f"\n{banner}")
    print("π CONVERSATION ANALYSIS")
    print(banner)

    if coherence_scores:
        avg_coherence = sum(coherence_scores) / len(coherence_scores)
        min_coherence = min(coherence_scores)
        max_coherence = max(coherence_scores)
        first_score, last_score = coherence_scores[0], coherence_scores[-1]
        if last_score > first_score:
            trend = "π Improving"
        elif last_score < first_score:
            trend = "π Degrading"
        else:
            # Equal endpoints (including a single-turn run) are not a decline.
            trend = "Stable"

        print("Coherence Metrics:")
        print(f" Average: {avg_coherence:.3f}")
        print(f" Min: {min_coherence:.3f}")
        print(f" Max: {max_coherence:.3f}")
        print(f" Trend: {trend}")

        # Check for intro loop: count replies containing an introduction-like
        # phrase (case-insensitive substring match).
        intro_phrases = ('i am', 'my name is', 'hello', 'greetings')
        intro_responses = [
            entry for entry in conversation_log
            if any(phrase in entry['response'].lower() for phrase in intro_phrases)
        ]
        if len(intro_responses) > 4:  # More than 4 intros in 30 turns
            print("β INTRO LOOP DETECTED: NPCs repeatedly introducing themselves")
            print(f" Found {len(intro_responses)} introduction-like responses")
        else:
            print("β CONVERSATION FLOW: NPCs moving beyond introductions")

        # Overall assessment
        if avg_coherence >= 0.65:
            print("π SUCCESS: Conversation coherence meets target (β₯0.65)")
        else:
            print(f"BELOW TARGET: Average coherence {avg_coherence:.3f} is under the 0.65 target")
    else:
        print("No coherence scores available for analysis")

    # Save detailed log
    timestamp = int(time.time())
    log_filename = f"npc_conversation_{npc_a_id}_{npc_b_id}_{timestamp}.json"
    with open(log_filename, 'w', encoding='utf-8') as f:
        json.dump({
            "test_metadata": {
                "npc_a": npc_a_id,
                "npc_b": npc_b_id,
                "turns_attempted": turns,
                "turns_completed": len(conversation_log),
                "test_timestamp": timestamp,
                "avg_coherence": avg_coherence,
            },
            "conversation_log": conversation_log,
        }, f, indent=2)
    print(f"πΎ Detailed log saved to: {log_filename}")
def test_mass_query(npc_ids, prompt, warmup_first=True):
    """Test mass query - should get diverse responses, not all identical.

    Sends the same ``prompt`` to every NPC through ``{API_BASE}/npc/chat``
    (player id ``mass-query``), then prints a diversity report (share of
    distinct replies) and the average coherence of the successful replies.

    Args:
        npc_ids: Iterable of NPC identifiers to query, in order.
        prompt: Message sent unchanged to every NPC.
        warmup_first: When true, each NPC is first warmed up with two
            dialogue turns via ``warm_up_npc``.

    Returns:
        dict mapping each NPC id to its result. Successful entries carry
        ``response`` (reply truncated to 120 characters), ``coherence``,
        ``emotion`` and ``turn_number``; failed entries carry
        ``response == 'FAILED'`` and ``coherence == 0.0``.
    """
    from collections import Counter

    banner = "=" * 70
    print(f"\n{banner}")
    print(f"π― TESTING MASS QUERY: '{prompt}'")
    print(f"π‘ Testing {len(npc_ids)} NPCs: {', '.join(npc_ids)}")
    print(banner)

    # Optional warm-up to ensure dialogue anchors
    if warmup_first:
        print("π Warming up all NPCs for fair comparison...")
        for npc_id in npc_ids:
            warm_up_npc(npc_id, warmup_turns=2)
        print("β All NPCs warmed up")

    print("\nπ Executing mass query...")
    responses = {}
    # Successful replies keyed by NPC id as (full text, coherence). Kept apart
    # from `responses` so the analysis compares untruncated text and a reply
    # that literally says "FAILED" is not mistaken for a failed request.
    successes = {}
    for npc_id in npc_ids:
        response = requests.post(f"{API_BASE}/npc/chat", json={
            "npc_id": npc_id,
            "player_id": "mass-query",
            "message": prompt,
        }, timeout=30)
        if response.status_code != 200:
            print(f" β οΈ {npc_id}: Failed ({response.status_code})")
            responses[npc_id] = {'response': 'FAILED', 'coherence': 0.0}
            successes.pop(npc_id, None)
            continue

        result = response.json()
        full_text = result['npc_response']
        npc_response = full_text[:120]  # Truncate for display
        coherence = result.get('coherence_score', 0.0)
        responses[npc_id] = {
            'response': npc_response,
            'coherence': coherence,
            'emotion': result.get('emotion', 'unknown'),
            'turn_number': result.get('turn_number', 0),
        }
        successes[npc_id] = (full_text, coherence)
        print(f" {npc_id}: {npc_response}...")

    # Analysis
    response_counts = Counter(text for text, _ in successes.values())
    total_responses = len(successes)
    unique_responses = len(response_counts)

    print(f"\nπ MASS QUERY ANALYSIS")
    print(banner)
    print("Response Diversity:")
    print(f" Total responses: {total_responses}")
    print(f" Unique responses: {unique_responses}")

    if total_responses == 0:
        # Without this guard "0 < 0 * 0.7" is false and an empty result set
        # would be reported as good diversity.
        print("NO RESPONSES: No successful replies to analyse")
    elif unique_responses < total_responses * 0.7:  # Less than 70% unique
        print("β CONTEXT COLLAPSE: Low response diversity detected")
        print(" NPCs giving similar/generic responses")
        print(" This indicates empty or identical context retrieval")
        # Show duplicate analysis, most frequent first
        duplicates = [
            (text, count)
            for text, count in response_counts.most_common()
            if count > 1
        ]
        if duplicates:
            print(" Duplicate responses found:")
            for text, count in duplicates[:3]:  # Show top 3 duplicates
                print(f" {count}x: '{text[:60]}...'")
    else:
        print("β GOOD DIVERSITY: NPCs giving unique, contextual responses")

    # Coherence analysis
    coherences = [score for _, score in successes.values()]
    if coherences:
        avg_coherence = sum(coherences) / len(coherences)
        print(f"Average coherence: {avg_coherence:.3f}")
        if avg_coherence < 0.6:
            print("β LOW COHERENCE: Responses lack context consistency")
        elif avg_coherence >= 0.7:
            print("β HIGH COHERENCE: Responses are contextually coherent")
        else:
            print("β οΈ MODERATE COHERENCE: Mixed quality responses")

    return responses
def run_complete_testing_suite():
    """Run the complete testing suite with proper sequencing.

    Order of execution: a 30-turn dual-NPC conversation, a warmed-up mass
    query over four NPCs, then a second mass query (no warm-up) over the
    first three of them. Ends by printing the expected-results summary.
    """
    rule = "=" * 80
    roster = ["alice-clean", "bob-skeptic", "gandalf-wizard", "elara-guardian"]

    print("π§ͺ STARTING WARBLER CDA MULTI-AGENT TESTING SUITE")
    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"π·οΈ Timestamp: {started_at}")
    print(rule)

    # Test 1: Dual NPC conversation (the core problem)
    print("π TEST 1: DUAL NPC CONVERSATION")
    test_dual_npc_conversation("alice-clean", "bob-skeptic", turns=30)

    # Test 2: Mass query diversity
    print("\nπ TEST 2: MASS QUERY DIVERSITY")
    test_mass_query(
        roster,
        "What is your greatest achievement in life?",
        warmup_first=True,
    )

    # Test 3: Different mass query to check for generic responses
    print("\nπ TEST 3: ALT MASS QUERY (Different Prompt)")
    test_mass_query(
        roster[:3],
        "How do you approach difficult conversations?",
        warmup_first=False,
    )

    # Summary
    summary_lines = (
        f"\n{rule}",
        "π― TESTING COMPLETE - EXPECTED RESULTS:",
        "β Dual-conversation coherence β₯0.65 (no intro loops)",
        "β Mass queries: High response diversity (no context collapse)",
        "β Coherence scoring improved from ~0.69 to ~0.79",
        rule,
    )
    for line in summary_lines:
        print(line)
if __name__ == "__main__":
    # Script entry point. Failures are reported on stdout/stderr AND through
    # the process exit status, so shells and CI can tell a failed or aborted
    # run from a successful one (previously the script always exited 0).
    try:
        run_complete_testing_suite()
    except KeyboardInterrupt:
        print("\nπ Test interrupted by user")
        # 130 = 128 + SIGINT, the conventional status for a Ctrl-C abort.
        raise SystemExit(130) from None
    except Exception as e:
        print(f"\nβ Test suite failed: {e}")
        import traceback
        traceback.print_exc()
        raise SystemExit(1) from None