# NOTE(review): the lines "Spaces: / Running / Running" here were web-UI status
# residue from the paste, not part of the script; kept as a comment so line 1
# of the file remains valid Python.
#!/usr/bin/env python3
"""
Quick diagnostic to check why your demo is timing out.
Run this to see if it's context size, Ollama issues, or something else.
"""
import subprocess
import time
import sys
from pathlib import Path

# Make the project package (one level above this script's directory) importable.
sys.path.insert(0, str(Path(__file__).parent.parent))
# Banner: identify the tool and draw a separator. (The original emoji here was
# mojibake — "π…" — restored to a readable glyph.)
print("🎲 D&D RAG System - Performance Diagnostic\n")
print("=" * 70)
# 1. Check Ollama is running.
# 'ollama ps' lists models currently loaded in memory; the short timeout keeps
# the diagnostic snappy if the daemon is hung.
print("\n1️⃣ Checking Ollama...")
try:
    result = subprocess.run(['ollama', 'ps'], capture_output=True, text=True, timeout=5)
    if result.returncode == 0:
        print("   ✅ Ollama is running")
        if result.stdout.strip():
            print(f"   📋 Loaded models:\n{result.stdout}")
        else:
            # Cold model: the first request pays the load cost.
            print("   ⚠️ No models loaded in memory!")
            print("      Model will be loaded on first request (adds 5-10s delay)")
    else:
        print(f"   ❌ Ollama error: {result.stderr}")
except FileNotFoundError:
    # Binary not installed — nothing further can be diagnosed.
    print("   ❌ Ollama not found! Install from https://ollama.ai")
    sys.exit(1)
except subprocess.TimeoutExpired:
    print("   ❌ Ollama not responding")
    sys.exit(1)
# 2. Test inference speed with a single one-shot generation.
print("\n2️⃣ Testing inference speed...")
model = "hf.co/Chun121/Qwen3-4B-RPG-Roleplay-V2:Q4_K_M"
test_prompt = "You are a DM. The player attacks. Respond in 1 sentence."
print(f"   Testing with: {model}")
print(f"   Prompt: '{test_prompt}'")

start = time.time()
try:
    # 30s cap mirrors the worst latency the app can tolerate per turn.
    result = subprocess.run(
        ['ollama', 'run', model, test_prompt],
        capture_output=True,
        text=True,
        timeout=30
    )
    elapsed = time.time() - start
    if result.returncode == 0:
        print(f"   ✅ Response in {elapsed:.1f}s")
        # Latency bands: <5s healthy, 5-15s borderline, >=15s will time out.
        if elapsed < 5:
            print("   🚀 Fast! System is healthy.")
        elif elapsed < 15:
            print("   ⚠️ Acceptable but could be faster.")
        else:
            print("   🔥 Slow! This will cause timeouts in long sessions.")
        print(f"   📝 Response: {result.stdout[:100]}...")
    else:
        print(f"   ❌ Error: {result.stderr}")
except subprocess.TimeoutExpired:
    print("   ❌ TIMEOUT after 30s! Your system is too slow.")
    print("   Possible causes:")
    print("     - CPU too slow")
    print("     - RAM/swap thrashing")
    print("     - Model too large for available RAM")
# 3. Check context window settings — the knobs that control per-request
# prompt growth. Guarded so a broken project package doesn't abort the
# remaining diagnostics (same policy as the GameMaster check below).
print("\n3️⃣ Checking context window settings...")
try:
    from dnd_rag_system.config import settings

    print(f"   MAX_MESSAGE_HISTORY: {settings.MAX_MESSAGE_HISTORY}")
    print(f"   RECENT_MESSAGES_FOR_PROMPT: {settings.RECENT_MESSAGES_FOR_PROMPT}")
    print(f"   OLLAMA_TIMEOUT: {settings.OLLAMA_TIMEOUT}s")

    # Histories beyond ~20 messages inflate the prompt and slow every turn.
    if settings.MAX_MESSAGE_HISTORY <= 20:
        print("   ✅ Pruning is configured correctly")
    else:
        print(f"   ⚠️ MAX_MESSAGE_HISTORY is {settings.MAX_MESSAGE_HISTORY} (high!)")
        print("      Consider lowering to 15-20")
except Exception as e:
    print(f"   ❌ Could not load settings: {e}")
# 4. Smoke-test GameMaster construction and confirm the pruning/summary
# attributes the timeout fix relies on are present.
print("\n4️⃣ Testing GameMaster initialization...")
try:
    from dnd_rag_system.core.chroma_manager import ChromaDBManager
    from dnd_rag_system.systems.gm_dialogue_unified import GameMaster

    db = ChromaDBManager()
    gm = GameMaster(db)
    print("   ✅ GameMaster initialized")
    print(f"   Message history: {len(gm.message_history)}")
    print(f"   Has _prune_message_history: {hasattr(gm, '_prune_message_history')}")
    print(f"   Has conversation_summary: {hasattr(gm, 'conversation_summary')}")
except Exception as e:
    # Broad catch is deliberate: this is a diagnostic boundary, report and move on.
    print(f"   ❌ Error: {e}")
# 5. Back-of-envelope token budget for a typical request, itemized by
# prompt component. Purely arithmetic — no model call involved.
print("\n5️⃣ Estimating context size...")
estimated_tokens = (
    100 +  # System prompt
    50 +   # Location/time
    75 +   # Party status
    200 +  # Recent 8 messages
    100    # Instructions
)
print(f"   Estimated: ~{estimated_tokens} tokens per request")
# Under ~1000 tokens the prompt itself should never be the bottleneck.
if estimated_tokens < 1000:
    print("   ✅ Context is small - should be fast!")
else:
    print(f"   ⚠️ Context is {estimated_tokens} tokens - may be slow")
# 6. Final verdict: interpret the inference timing and list concrete next steps.
print("\n" + "=" * 70)
print("📊 DIAGNOSTIC SUMMARY")
print("=" * 70)
print()
print("If inference test was < 5s: ✅ System is healthy")
print("If inference test was 5-15s: ⚠️ Borderline - may timeout after 20+ turns")
print("If inference test was > 15s: 🔥 Will definitely timeout - need faster hardware")
print()
print("Next steps:")
print("  1. If Ollama test was slow, try a smaller model:")
print("     ollama pull qwen2.5:3b")
print("     (Then update OLLAMA_MODEL_NAME in settings.py)")
print()
print("  2. Restart Gradio to clear old sessions:")
print("     ./stop_gradio.sh && ./start_gradio.sh")
print()
print("  3. Run demo again and watch for 'Pruned' messages in logs")
print()
print("=" * 70)