Spaces:

Bellok
/

warbler-cda

Sleeping

warbler-cda / test_multiagent_complete.py

Bellok

feat(docs, refactor): add NPC Chat API integration guide and update data ingestion

ec2d906 about 2 months ago

11.1 kB

	#!/usr/bin/env python3
	"""
	Test dual NPC interactions with warm-up phase for proper multi-agent conversations.

	This implements the complete testing protocol to solve:
	1. Intro loop problem (dialogue prioritization over biography)
	2. Context collapse (warm-up ensures dialogue anchors exist)
	3. Mass query diversity (fallback context prevents empty retrieval)

	Based on Perplexity.ai diagnostics for proper NPC-to-NPC testing.
	"""

	import requests
	import json
	import time
	from datetime import datetime

	# Base URL that every HTTP request in this script is sent to; endpoint paths
	# such as "/npc/chat" are appended to it.
	API_BASE = "http://localhost:8000"

	def warm_up_npc(npc_id, warmup_turns=3):
	    """Pre-populate an NPC with dialogue turns before a test runs.

	    Posts up to ``warmup_turns`` fixed prompts to ``{API_BASE}/npc/chat`` as
	    the player ``"warmup-system"`` and prints the outcome of every turn.

	    Args:
	        npc_id: Identifier sent as ``npc_id`` in each chat request.
	        warmup_turns: Number of prompts to send. Only three prompts are
	            defined, so larger values are clamped to three; zero or a
	            negative value sends nothing.

	    Returns:
	        None. Results are reported on stdout only.

	    Note:
	        Exceptions raised by ``requests.post`` or ``response.json()`` are not
	        caught here and propagate to the caller.
	    """
	    print(f"🔄 Warming up {npc_id} with {warmup_turns} dialogue turns...")

	    warmup_prompts = [
	        "Who are you and what is your purpose?",
	        "Tell me about yourself in more detail.",
	        "What makes you unique in your role?",
	    ]

	    # Slicing clamps the turn count to the number of available prompts.
	    selected_prompts = warmup_prompts[:max(warmup_turns, 0)]

	    for turn, prompt in enumerate(selected_prompts, start=1):
	        response = requests.post(f"{API_BASE}/npc/chat", json={
	            "npc_id": npc_id,
	            "player_id": "warmup-system",
	            "message": prompt
	        }, timeout=30)

	        if response.status_code == 200:
	            result = response.json()
	            # A missing or null score is reported as 0.0 instead of breaking
	            # the numeric format below.
	            coherence = result.get('coherence_score') or 0.0
	            print(f" ✓ Warm-up turn {turn}: coherence {coherence:.3f}")
	        else:
	            print(f" ⚠️ Warm-up turn {turn} failed: {response.status_code}")

	        time.sleep(0.2)  # Brief pause between turns

	    print(f"✅ {npc_id} warmed up with dialogue history")

	def test_dual_npc_conversation(npc_a_id, npc_b_id, turns=30):
	    """Run an NPC-to-NPC conversation after warming up both NPCs.

	    The two NPCs alternate: each reply returned by ``{API_BASE}/npc/chat`` is
	    sent as the next message to the other NPC. Every fifth turn the
	    self-consumption metrics endpoint is queried on a best-effort basis.
	    After the loop a coherence summary and an intro-loop check are printed,
	    and the full conversation is written to a JSON file named
	    ``npc_conversation_<npc_a_id>_<npc_b_id>_<timestamp>.json`` (relative
	    path, so it lands in the current working directory).

	    Args:
	        npc_a_id: NPC that speaks first.
	        npc_b_id: NPC that answers.
	        turns: Maximum number of turns; the loop stops early on the first
	            non-200 chat response.

	    Returns:
	        None. Results are printed and saved to the log file.
	    """
	    print(f"\n{'='*70}")
	    print(f"🗣️ TESTING DUAL NPC CONVERSATION: {npc_a_id} ↔ {npc_b_id}")
	    print(f"{'='*70}")

	    # Warm up both NPCs to ensure dialogue anchors exist
	    warm_up_npc(npc_a_id, warmup_turns=3)
	    warm_up_npc(npc_b_id, warmup_turns=3)

	    # Track metrics
	    conversation_log = []
	    coherence_scores = []
	    avg_coherence = 0.0  # stays 0.0 in the saved log when no turn succeeds

	    # The first speaker opens by replying to a fixed seed message
	    current_speaker = npc_a_id
	    other_speaker = npc_b_id
	    last_message = "Hello there! I've been thinking about how we can work together to improve our conversations."

	    print(f"\nStarting {turns}-turn conversation...")
	    print("-" * 50)

	    for turn in range(1, turns + 1):
	        # Current speaker responds to last message
	        response = requests.post(f"{API_BASE}/npc/chat", json={
	            "npc_id": current_speaker,
	            "player_id": "npc-system",
	            "message": last_message
	        }, timeout=30)

	        if response.status_code != 200:
	            print(f"❌ Turn {turn} failed: {response.status_code} - {response.text[:100]}...")
	            break

	        result = response.json()
	        # The reply text is required to continue; the remaining fields fall
	        # back to defaults, as the other request helpers in this file do.
	        npc_response = result['npc_response']
	        coherence = result.get('coherence_score') or 0.0
	        coherence_scores.append(coherence)

	        # Display turn information
	        response_text = npc_response[:80]
	        print(f"Turn {turn:2d}: {current_speaker}")
	        print(f" 💬 {response_text}...")
	        print(f" 📈 Coherence: {coherence:.3f}")

	        conversation_log.append({
	            "turn": turn,
	            "speaker": current_speaker,
	            "input_message": last_message,
	            "response": npc_response,
	            "coherence": coherence,
	            "emotion": result.get('emotion', 'unknown'),
	            "intent": result.get('intent', 'unknown'),
	        })

	        # Check for self-consumption metrics every 5 turns
	        if turn % 5 == 0:
	            # Best effort: a metrics failure must not abort the conversation.
	            try:
	                metrics_response = requests.get(
	                    f"{API_BASE}/npc/metrics/self-consumption", timeout=30
	                )
	                if metrics_response.status_code == 200:
	                    metrics = metrics_response.json()
	                    anchors = metrics.get('anchors_created', 0)
	                    micros = metrics.get('micro_summaries_distilled', 0)
	                    macros = metrics.get('macro_distillations_created', 0)
	                    print(
	                        f" 📊 Self-consumption: anchors={anchors}, "
	                        f"micro-summaries={micros}, macro-distillations={macros}"
	                    )
	            except Exception as e:
	                print(f" 📊 Could not retrieve metrics: {e}")

	        # Switch speakers
	        current_speaker, other_speaker = other_speaker, current_speaker
	        last_message = npc_response

	    # Analysis
	    print(f"\n{'='*70}")
	    print("📊 CONVERSATION ANALYSIS")
	    print(f"{'='*70}")

	    if coherence_scores:
	        avg_coherence = sum(coherence_scores) / len(coherence_scores)
	        min_coherence = min(coherence_scores)
	        max_coherence = max(coherence_scores)
	        # Compares only the first and last turn; equal scores count as degrading.
	        trend = "📈 Improving" if coherence_scores[-1] > coherence_scores[0] else "📉 Degrading"

	        print("Coherence Metrics:")
	        print(f" Average: {avg_coherence:.3f}")
	        print(f" Min: {min_coherence:.3f}")
	        print(f" Max: {max_coherence:.3f}")
	        print(f" Trend: {trend}")

	        # Check for intro loop
	        intro_phrases = ('i am', 'my name is', 'hello', 'greetings')
	        intro_responses = [
	            entry for entry in conversation_log
	            if any(phrase in entry['response'].lower() for phrase in intro_phrases)
	        ]

	        # NOTE: the threshold of 4 is fixed and does not scale with `turns`.
	        if len(intro_responses) > 4:
	            print("❌ INTRO LOOP DETECTED: NPCs repeatedly introducing themselves")
	            print(f" Found {len(intro_responses)} introduction-like responses")
	        else:
	            print("✅ CONVERSATION FLOW: NPCs moving beyond introductions")

	        # Overall assessment
	        if avg_coherence >= 0.65:
	            print("🌟 SUCCESS: Conversation coherence meets target (≥0.65)")
	        else:
	            print(f"⚠️ BELOW TARGET: Average coherence {avg_coherence:.3f} is under 0.65")

	    else:
	        print("No coherence scores available for analysis")

	    # Save detailed log
	    timestamp = int(time.time())
	    log_filename = f"npc_conversation_{npc_a_id}_{npc_b_id}_{timestamp}.json"
	    with open(log_filename, 'w', encoding='utf-8') as f:
	        json.dump({
	            "test_metadata": {
	                "npc_a": npc_a_id,
	                "npc_b": npc_b_id,
	                "turns_attempted": turns,
	                "turns_completed": len(conversation_log),
	                "test_timestamp": timestamp,
	                "avg_coherence": avg_coherence
	            },
	            "conversation_log": conversation_log
	        }, f, indent=2)

	    print(f"💾 Detailed log saved to: {log_filename}")

	def test_mass_query(npc_ids, prompt, warmup_first=True):
	    """Send one prompt to several NPCs and report how diverse the replies are.

	    Each NPC is queried through ``{API_BASE}/npc/chat`` as the player
	    ``"mass-query"``. Replies are truncated to 120 characters before they are
	    stored, so the diversity check compares those truncated texts.

	    Args:
	        npc_ids: NPC identifiers to query, in order.
	        prompt: Message sent to every NPC.
	        warmup_first: When true, each NPC gets two warm-up turns first.

	    Returns:
	        dict mapping each NPC id to its result. Successful entries hold
	        ``response``, ``coherence``, ``emotion`` and ``turn_number``; failed
	        entries hold ``response == 'FAILED'`` and ``coherence == 0.0``.
	    """
	    from collections import Counter

	    print(f"\n{'='*70}")
	    print(f"🎯 TESTING MASS QUERY: '{prompt}'")
	    print(f"📡 Testing {len(npc_ids)} NPCs: {', '.join(npc_ids)}")
	    print(f"{'='*70}")

	    # Optional warm-up to ensure dialogue anchors
	    if warmup_first:
	        print("🔄 Warming up all NPCs for fair comparison...")
	        for npc_id in npc_ids:
	            warm_up_npc(npc_id, warmup_turns=2)
	        print("✅ All NPCs warmed up")

	    print("\n🚀 Executing mass query...")
	    responses = {}

	    for npc_id in npc_ids:
	        response = requests.post(f"{API_BASE}/npc/chat", json={
	            "npc_id": npc_id,
	            "player_id": "mass-query",
	            "message": prompt
	        }, timeout=30)

	        if response.status_code == 200:
	            result = response.json()
	            npc_response = result['npc_response'][:120]  # Truncate for display
	            responses[npc_id] = {
	                'response': npc_response,
	                'coherence': result.get('coherence_score', 0.0),
	                'emotion': result.get('emotion', 'unknown'),
	                'turn_number': result.get('turn_number', 0)
	            }
	            print(f" {npc_id}: {npc_response}...")
	        else:
	            print(f" ⚠️ {npc_id}: Failed ({response.status_code})")
	            responses[npc_id] = {'response': 'FAILED', 'coherence': 0.0}

	    # Analysis
	    succeeded = [r for r in responses.values() if r['response'] != 'FAILED']
	    successful_responses = [r['response'] for r in succeeded]
	    unique_responses = len(set(successful_responses))
	    total_responses = len(successful_responses)

	    print("\n📊 MASS QUERY ANALYSIS")
	    print(f"{'='*70}")

	    print("Response Diversity:")
	    print(f" Total responses: {total_responses}")
	    print(f" Unique responses: {unique_responses}")

	    if total_responses == 0:
	        # Without this guard, 0 < 0 * 0.7 is false and an all-failed run
	        # would be reported as good diversity.
	        print("⚠️ NO RESPONSES: Every request failed; diversity cannot be assessed")
	    elif unique_responses < total_responses * 0.7:  # Less than 70% unique
	        print("❌ CONTEXT COLLAPSE: Low response diversity detected")
	        print(" NPCs giving similar/generic responses")
	        print(" This indicates empty or identical context retrieval")

	        # Show duplicate analysis, most frequent first
	        duplicates = [
	            (text, count)
	            for text, count in Counter(successful_responses).most_common()
	            if count > 1
	        ]
	        if duplicates:
	            print(" Duplicate responses found:")
	            for text, count in duplicates[:3]:  # Show top 3 duplicates
	                print(f" {count}x: '{text[:60]}...'")
	    else:
	        print("✅ GOOD DIVERSITY: NPCs giving unique, contextual responses")

	    # Coherence analysis
	    coherences = [r['coherence'] for r in succeeded]
	    if coherences:
	        avg_coherence = sum(coherences) / len(coherences)
	        print(f" Average coherence: {avg_coherence:.3f}")

	        if avg_coherence < 0.6:
	            print("❌ LOW COHERENCE: Responses lack context consistency")
	        elif avg_coherence >= 0.7:
	            print("✅ HIGH COHERENCE: Responses are contextually coherent")
	        else:
	            print("⚠️ MODERATE COHERENCE: Mixed quality responses")

	    return responses

	def run_complete_testing_suite():
	    """Execute the three test scenarios in sequence and print a summary.

	    Runs one dual-NPC conversation, then a mass query against four NPCs with
	    warm-up, then a second mass query against the first three NPCs without
	    warm-up. Finishes by printing the expected-results banner.
	    """
	    wide_rule = "=" * 80

	    print("🧪 STARTING WARBLER CDA MULTI-AGENT TESTING SUITE")
	    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
	    print(f"🏷️ Timestamp: {started_at}")
	    print(wide_rule)

	    # Scenario 1: two NPCs talking to each other (the core problem)
	    print("📋 TEST 1: DUAL NPC CONVERSATION")
	    test_dual_npc_conversation("alice-clean", "bob-skeptic", turns=30)

	    # Scenario 2: one prompt fanned out to the full roster
	    roster = ["alice-clean", "bob-skeptic", "gandalf-wizard", "elara-guardian"]
	    print("\n📋 TEST 2: MASS QUERY DIVERSITY")
	    test_mass_query(roster, "What is your greatest achievement in life?", warmup_first=True)

	    # Scenario 3: a different prompt, first three NPCs only, no warm-up
	    print("\n📋 TEST 3: ALT MASS QUERY (Different Prompt)")
	    test_mass_query(roster[:3], "How do you approach difficult conversations?", warmup_first=False)

	    # Closing banner
	    closing_lines = (
	        "\n" + wide_rule,
	        "🎯 TESTING COMPLETE - EXPECTED RESULTS:",
	        "✅ Dual-conversation coherence ≥0.65 (no intro loops)",
	        "✅ Mass queries: High response diversity (no context collapse)",
	        "✅ Coherence scoring improved from ~0.69 to ~0.79",
	        wide_rule,
	    )
	    for line in closing_lines:
	        print(line)

	if __name__ == "__main__":
	    # Script entry point. Failures are reported on stdout/stderr as before,
	    # but the process now exits with a non-zero status so that shells and CI
	    # jobs can tell a failed or interrupted run from a successful one.
	    try:
	        run_complete_testing_suite()
	    except KeyboardInterrupt:
	        print("\n🛑 Test interrupted by user")
	        raise SystemExit(130)  # conventional shell status for Ctrl-C
	    except Exception as e:
	        import traceback

	        print(f"\n❌ Test suite failed: {e}")
	        traceback.print_exc()
	        raise SystemExit(1)