warbler-cda / test_multiagent_complete.py
Bellok
feat(docs, refactor): add NPC Chat API integration guide and update data ingestion
ec2d906
raw
history blame
11.1 kB
#!/usr/bin/env python3
"""
Test dual NPC interactions with warm-up phase for proper multi-agent conversations.
This implements the complete testing protocol to solve:
1. Intro loop problem (dialogue prioritization over biography)
2. Context collapse (warm-up ensures dialogue anchors exist)
3. Mass query diversity (fallback context prevents empty retrieval)
Based on Perplexity.ai diagnostics for proper NPC-to-NPC testing.
"""
import requests
import json
import time
from datetime import datetime
# Base URL that the request helpers in this script prefix to each endpoint
# path (e.g. f"{API_BASE}/npc/chat").
API_BASE = "http://localhost:8000"
def warm_up_npc(npc_id, warmup_turns=3):
    """Pre-populate an NPC with dialogue anchors to prevent cold start problems.

    Sends a fixed sequence of prompts to ``POST {API_BASE}/npc/chat`` as the
    player ``"warmup-system"`` and prints the coherence score reported for
    each successful reply.

    Args:
        npc_id: Identifier sent as ``npc_id`` in every request.
        warmup_turns: Number of prompts to send. Only three prompts are
            defined, so larger values are capped at three; values <= 0 send
            nothing.

    Raises:
        requests.RequestException: Propagated from ``requests.post`` when the
            request cannot be completed (e.g. connection error or the 30 s
            timeout elapsing).
    """
    print(f"πŸ”„ Warming up {npc_id} with {warmup_turns} dialogue turns...")
    warmup_prompts = [
        "Who are you and what is your purpose?",
        "Tell me about yourself in more detail.",
        "What makes you unique in your role?"
    ]
    # max(..., 0) keeps a negative count from turning into a "drop the last
    # N prompts" slice; the original range()-based loop sent nothing then.
    selected_prompts = warmup_prompts[:max(warmup_turns, 0)]
    for turn_number, prompt in enumerate(selected_prompts, start=1):
        response = requests.post(f"{API_BASE}/npc/chat", json={
            "npc_id": npc_id,
            "player_id": "warmup-system",
            "message": prompt
        }, timeout=30)
        if response.status_code == 200:
            result = response.json()
            coherence = result.get('coherence_score', 0.0)
            # Previously printed the literal text ".3f" instead of the score.
            print(f" Warm-up turn {turn_number}: coherence {coherence:.3f}")
        else:
            print(f" ⚠️ Warm-up turn {turn_number} failed: {response.status_code}")
        time.sleep(0.2)  # Brief pause between turns
    print(f"βœ… {npc_id} warmed up with dialogue history")
def test_dual_npc_conversation(npc_a_id, npc_b_id, turns=30):
    """Test NPC-to-NPC conversation with proper warm-up.

    Warms up both NPCs, then alternates speakers for up to ``turns`` turns,
    feeding each NPC the other's previous response through
    ``POST {API_BASE}/npc/chat``. Every fifth turn the self-consumption
    metrics endpoint is polled on a best-effort basis. Afterwards coherence
    statistics and an intro-loop check are printed, and the full conversation
    is written to ``npc_conversation_<a>_<b>_<unix-timestamp>.json`` in the
    current working directory.

    Args:
        npc_a_id: NPC that speaks first.
        npc_b_id: NPC that replies to ``npc_a_id``.
        turns: Maximum number of turns to attempt; the loop stops early on
            the first non-200 chat response.

    Raises:
        requests.RequestException: Propagated from the chat requests when a
            request cannot be completed or times out.
        KeyError: If a successful chat response has no ``npc_response`` field.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print(f"πŸ—£οΈ TESTING DUAL NPC CONVERSATION: {npc_a_id} ↔ {npc_b_id}")
    print(banner)

    # Warm up both NPCs to ensure dialogue anchors exist
    warm_up_npc(npc_a_id, warmup_turns=3)
    warm_up_npc(npc_b_id, warmup_turns=3)

    # Track metrics
    conversation_log = []
    coherence_scores = []
    # Bound up front so the log metadata never depends on which analysis
    # branch ran.
    avg_coherence = 0.0

    # The first speaker answers a fixed opening line.
    current_speaker = npc_a_id
    other_speaker = npc_b_id
    last_message = "Hello there! I've been thinking about how we can work together to improve our conversations."

    print(f"\nStarting {turns}-turn conversation...")
    print("-" * 50)

    for turn in range(1, turns + 1):
        # Current speaker responds to last message
        response = requests.post(f"{API_BASE}/npc/chat", json={
            "npc_id": current_speaker,
            "player_id": "npc-system",
            "message": last_message
        }, timeout=30)
        if response.status_code != 200:
            print(f"❌ Turn {turn} failed: {response.status_code} - {response.text[:100]}...")
            break

        result = response.json()
        npc_response = result['npc_response']
        # Optional fields use the same defaults as the other helpers in this
        # file, so a missing key does not abort the whole conversation.
        coherence = result.get('coherence_score', 0.0)
        coherence_scores.append(coherence)

        # Display turn information
        response_text = npc_response[:80]
        print(f"Turn {turn:2d}: {current_speaker}")
        print(f" πŸ’¬ {response_text}...")
        # Previously printed the literal text ".3f" instead of the score.
        print(f" Coherence: {coherence:.3f}")

        conversation_log.append({
            "turn": turn,
            "speaker": current_speaker,
            "input_message": last_message,
            "response": npc_response,
            "coherence": coherence,
            "emotion": result.get('emotion', 'unknown'),
            "intent": result.get('intent', 'unknown'),
        })

        # Check for self-consumption metrics every 5 turns
        if turn % 5 == 0:
            # Deliberately best-effort: a metrics failure must not end the
            # conversation, so any error is reported and ignored.
            try:
                # The timeout keeps an unresponsive metrics endpoint from
                # hanging the run; requests waits indefinitely without one.
                metrics_response = requests.get(
                    f"{API_BASE}/npc/metrics/self-consumption", timeout=30
                )
                if metrics_response.status_code == 200:
                    metrics = metrics_response.json()
                    anchors = metrics.get('anchors_created', 0)
                    micros = metrics.get('micro_summaries_distilled', 0)
                    macros = metrics.get('macro_distillations_created', 0)
                    # Previously printed the literal text ".1f".
                    print(
                        f" Self-consumption metrics: anchors={anchors}, "
                        f"micro-summaries={micros}, macro-distillations={macros}"
                    )
            except Exception as e:
                print(f" πŸ“Š Could not retrieve metrics: {e}")

        # Switch speakers
        current_speaker, other_speaker = other_speaker, current_speaker
        last_message = npc_response

    # Analysis
    print(f"\n{banner}")
    print("πŸ“Š CONVERSATION ANALYSIS")
    print(banner)

    if coherence_scores:
        avg_coherence = sum(coherence_scores) / len(coherence_scores)
        min_coherence = min(coherence_scores)
        max_coherence = max(coherence_scores)
        # Trend compares only the first and last turn.
        trend = "πŸ“ˆ Improving" if coherence_scores[-1] > coherence_scores[0] else "πŸ“‰ Degrading"

        print("Coherence Metrics:")
        print(f" Average: {avg_coherence:.3f}")
        print(f" Minimum: {min_coherence:.3f}")
        print(f" Maximum: {max_coherence:.3f}")
        print(f" Trend: {trend}")

        # Check for intro loop: count responses containing any
        # introduction-like phrase (plain substring match).
        intro_phrases = ('i am', 'my name is', 'hello', 'greetings')
        intro_responses = [
            entry for entry in conversation_log
            if any(phrase in entry['response'].lower() for phrase in intro_phrases)
        ]
        # NOTE: the threshold of 4 is fixed; it was chosen for the default
        # 30-turn run and does not scale with ``turns``.
        if len(intro_responses) > 4:  # More than 4 intros in 30 turns
            print("❌ INTRO LOOP DETECTED: NPCs repeatedly introducing themselves")
            print(f" Found {len(intro_responses)} introduction-like responses")
        else:
            print("βœ… CONVERSATION FLOW: NPCs moving beyond introductions")

        # Overall assessment
        if avg_coherence >= 0.65:
            print("🌟 SUCCESS: Conversation coherence meets target (β‰₯0.65)")
        else:
            # Previously printed the literal text ".3f".
            print(f"Coherence below target: average {avg_coherence:.3f} (target 0.65)")
    else:
        print("No coherence scores available for analysis")

    # Save detailed log
    timestamp = int(time.time())
    log_filename = f"npc_conversation_{npc_a_id}_{npc_b_id}_{timestamp}.json"
    with open(log_filename, 'w', encoding='utf-8') as f:
        json.dump({
            "test_metadata": {
                "npc_a": npc_a_id,
                "npc_b": npc_b_id,
                "turns_attempted": turns,
                "turns_completed": len(conversation_log),
                "test_timestamp": timestamp,
                "avg_coherence": avg_coherence
            },
            "conversation_log": conversation_log
        }, f, indent=2)
    print(f"πŸ’Ύ Detailed log saved to: {log_filename}")
def test_mass_query(npc_ids, prompt, warmup_first=True):
    """Test mass query - should get diverse responses, not all identical.

    Sends the same ``prompt`` to every NPC through
    ``POST {API_BASE}/npc/chat`` as player ``"mass-query"``, then prints a
    diversity analysis (unique vs. total responses) and a coherence summary.

    Args:
        npc_ids: Sequence of NPC identifiers to query.
        prompt: Message sent unchanged to every NPC.
        warmup_first: When true, each NPC is first warmed up with two
            dialogue turns.

    Returns:
        dict mapping each NPC id to a dict with keys ``response`` (the reply
        truncated to 120 characters, or ``'FAILED'`` for a non-200 status),
        ``coherence``, ``emotion`` and ``turn_number``.

    Raises:
        requests.RequestException: Propagated from the chat requests when a
            request cannot be completed or times out.
    """
    from collections import Counter

    banner = "=" * 70
    print(f"\n{banner}")
    print(f"🎯 TESTING MASS QUERY: '{prompt}'")
    print(f"πŸ“‘ Testing {len(npc_ids)} NPCs: {', '.join(npc_ids)}")
    print(banner)

    # Optional warm-up to ensure dialogue anchors
    if warmup_first:
        print("πŸ”„ Warming up all NPCs for fair comparison...")
        for npc_id in npc_ids:
            warm_up_npc(npc_id, warmup_turns=2)
        print("βœ… All NPCs warmed up")

    print("\nπŸš€ Executing mass query...")
    responses = {}
    for npc_id in npc_ids:
        response = requests.post(f"{API_BASE}/npc/chat", json={
            "npc_id": npc_id,
            "player_id": "mass-query",
            "message": prompt
        }, timeout=30)
        if response.status_code == 200:
            result = response.json()
            npc_response = result['npc_response'][:120]  # Truncate for display
            responses[npc_id] = {
                'response': npc_response,
                'coherence': result.get('coherence_score', 0.0),
                'emotion': result.get('emotion', 'unknown'),
                'turn_number': result.get('turn_number', 0)
            }
            print(f" {npc_id}: {npc_response}...")
        else:
            print(f" ⚠️ {npc_id}: Failed ({response.status_code})")
            # Same keys as a successful entry so callers can read every
            # record uniformly.
            responses[npc_id] = {
                'response': 'FAILED',
                'coherence': 0.0,
                'emotion': 'unknown',
                'turn_number': 0
            }

    # Analysis (diversity is judged on the 120-character truncated text)
    successful = [r for r in responses.values() if r['response'] != 'FAILED']
    successful_responses = [r['response'] for r in successful]
    unique_responses = len(set(successful_responses))
    total_responses = len(successful_responses)

    print("\nπŸ“Š MASS QUERY ANALYSIS")
    print(banner)
    print("Response Diversity:")
    print(f" Total responses: {total_responses}")
    print(f" Unique responses: {unique_responses}")

    if total_responses == 0:
        # Without this guard, "0 < 0 * 0.7" is false and a run in which
        # every request failed was reported as good diversity.
        print("No successful responses; diversity cannot be assessed")
    elif unique_responses < total_responses * 0.7:  # Less than 70% unique
        print("❌ CONTEXT COLLAPSE: Low response diversity detected")
        print(" NPCs giving similar/generic responses")
        print(" This indicates empty or identical context retrieval")

        # Show duplicate analysis, most frequent first
        duplicates = [
            (text, count)
            for text, count in Counter(successful_responses).most_common()
            if count > 1
        ]
        if duplicates:
            print(" Duplicate responses found:")
            for text, count in duplicates[:3]:  # Show top 3 duplicates
                print(f" {count}x: '{text[:60]}...'")
    else:
        print("βœ… GOOD DIVERSITY: NPCs giving unique, contextual responses")

    # Coherence analysis
    coherences = [r['coherence'] for r in successful]
    if coherences:
        avg_coherence = sum(coherences) / len(coherences)
        # Previously printed the literal text ".3f" instead of the average.
        print(f"Average coherence: {avg_coherence:.3f}")
        if avg_coherence < 0.6:
            print("❌ LOW COHERENCE: Responses lack context consistency")
        elif avg_coherence >= 0.7:
            print("βœ… HIGH COHERENCE: Responses are contextually coherent")
        else:
            print("⚠️ MODERATE COHERENCE: Mixed quality responses")

    return responses
def run_complete_testing_suite():
    """Run the complete testing suite with proper sequencing.

    Order: one 30-turn dual-NPC conversation, a warmed-up mass query over
    four NPCs, then a second mass query (no warm-up) over the first three.
    Finishes by printing the expected-results summary.
    """
    rule = "=" * 80

    print("πŸ§ͺ STARTING WARBLER CDA MULTI-AGENT TESTING SUITE")
    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"🏷️ Timestamp: {started_at}")
    print(rule)

    # Test 1: Dual NPC conversation (the core problem)
    print("πŸ“‹ TEST 1: DUAL NPC CONVERSATION")
    test_dual_npc_conversation("alice-clean", "bob-skeptic", turns=30)

    # Test 2: Mass query diversity
    roster = ["alice-clean", "bob-skeptic", "gandalf-wizard", "elara-guardian"]
    print("\nπŸ“‹ TEST 2: MASS QUERY DIVERSITY")
    test_mass_query(
        roster,
        "What is your greatest achievement in life?",
        warmup_first=True,
    )

    # Test 3: Different mass query to check for generic responses
    print("\nπŸ“‹ TEST 3: ALT MASS QUERY (Different Prompt)")
    test_mass_query(
        roster[:3],
        "How do you approach difficult conversations?",
        warmup_first=False,
    )

    # Summary
    expected_outcomes = (
        "βœ… Dual-conversation coherence β‰₯0.65 (no intro loops)",
        "βœ… Mass queries: High response diversity (no context collapse)",
        "βœ… Coherence scoring improved from ~0.69 to ~0.79",
    )
    print(f"\n{rule}")
    print("🎯 TESTING COMPLETE - EXPECTED RESULTS:")
    for outcome in expected_outcomes:
        print(outcome)
    print(rule)
# Script entry point: run the suite and report failure through the process
# exit status so shells and CI can detect it.
if __name__ == "__main__":
    try:
        run_complete_testing_suite()
    except KeyboardInterrupt:
        print("\nπŸ›‘ Test interrupted by user")
        # 130 = 128 + SIGINT, the conventional status for Ctrl-C.
        raise SystemExit(130) from None
    except Exception as e:
        print(f"\n❌ Test suite failed: {e}")
        import traceback
        traceback.print_exc()
        # Previously the script fell through and exited with status 0 even
        # though the suite had failed.
        raise SystemExit(1) from None