#!/usr/bin/env python3
"""
Quick diagnostic to check why your demo is timing out.
Run this to see if it's context size, Ollama issues, or something else.
"""
import subprocess
import time
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
# Banner: identify the diagnostic tool and draw a separator line.
print("D&D RAG System - Performance Diagnostic\n")
print("=" * 70)
# 1. Verify the Ollama daemon is reachable and report which models are
#    resident in memory (a cold model adds several seconds to the first call).
print("\n1. Checking Ollama...")
try:
    result = subprocess.run(['ollama', 'ps'], capture_output=True, text=True, timeout=5)
    if result.returncode == 0:
        print("   [OK] Ollama is running")
        if result.stdout.strip():
            print(f"   Loaded models:\n{result.stdout}")
        else:
            print("   [WARN] No models loaded in memory!")
            print("   Model will be loaded on first request (adds 5-10s delay)")
    else:
        print(f"   [FAIL] Ollama error: {result.stderr}")
except FileNotFoundError:
    # `ollama` binary not on PATH — nothing else in this diagnostic can work.
    print("   [FAIL] Ollama not found! Install from https://ollama.ai")
    sys.exit(1)
except subprocess.TimeoutExpired:
    # Daemon installed but hung/unresponsive within 5s.
    print("   [FAIL] Ollama not responding")
    sys.exit(1)
# 2. Time a single one-shot generation to gauge raw inference speed.
#    Thresholds (<5s fast, <15s borderline) mirror the summary section below.
print("\n2. Testing inference speed...")
model = "hf.co/Chun121/Qwen3-4B-RPG-Roleplay-V2:Q4_K_M"
test_prompt = "You are a DM. The player attacks. Respond in 1 sentence."
print(f"   Testing with: {model}")
print(f"   Prompt: '{test_prompt}'")
start = time.time()
try:
    result = subprocess.run(
        ['ollama', 'run', model, test_prompt],
        capture_output=True,
        text=True,
        timeout=30,
    )
    elapsed = time.time() - start
    if result.returncode == 0:
        print(f"   [OK] Response in {elapsed:.1f}s")
        if elapsed < 5:
            print("   Fast! System is healthy.")
        elif elapsed < 15:
            print("   [WARN] Acceptable but could be faster.")
        else:
            print("   [WARN] Slow! This will cause timeouts in long sessions.")
        # Truncate the model output so the report stays one screen tall.
        print(f"   Response: {result.stdout[:100]}...")
    else:
        print(f"   [FAIL] Error: {result.stderr}")
except subprocess.TimeoutExpired:
    print("   [FAIL] TIMEOUT after 30s! Your system is too slow.")
    print("   Possible causes:")
    print("   - CPU too slow")
    print("   - RAM/swap thrashing")
    print("   - Model too large for available RAM")
# 3. Inspect the project's context-window / history-pruning configuration.
#    Import is local to this section so the earlier checks run even if the
#    project package is broken.
print("\n3. Checking context window settings...")
from dnd_rag_system.config import settings
print(f"   MAX_MESSAGE_HISTORY: {settings.MAX_MESSAGE_HISTORY}")
print(f"   RECENT_MESSAGES_FOR_PROMPT: {settings.RECENT_MESSAGES_FOR_PROMPT}")
print(f"   OLLAMA_TIMEOUT: {settings.OLLAMA_TIMEOUT}s")
if settings.MAX_MESSAGE_HISTORY <= 20:
    print("   [OK] Pruning is configured correctly")
else:
    print(f"   [WARN] MAX_MESSAGE_HISTORY is {settings.MAX_MESSAGE_HISTORY} (high!)")
    print("   Consider lowering to 15-20")
# 4. Smoke-test GameMaster construction and confirm the pruning/summary
#    attributes exist (their absence would explain unbounded context growth).
print("\n4. Testing GameMaster initialization...")
try:
    from dnd_rag_system.core.chroma_manager import ChromaDBManager
    from dnd_rag_system.systems.gm_dialogue_unified import GameMaster
    db = ChromaDBManager()
    gm = GameMaster(db)
    print("   [OK] GameMaster initialized")
    print(f"   Message history: {len(gm.message_history)}")
    print(f"   Has _prune_message_history: {hasattr(gm, '_prune_message_history')}")
    print(f"   Has conversation_summary: {hasattr(gm, 'conversation_summary')}")
except Exception as e:
    # Broad catch is deliberate: this is a diagnostic, any failure is reported.
    print(f"   [FAIL] Error: {e}")
# 5. Back-of-envelope token budget for one typical prompt; component sizes
#    are rough assumptions, not measured values.
print("\n5. Estimating context size...")
estimated_tokens = (
    100    # System prompt
    + 50   # Location/time
    + 75   # Party status
    + 200  # Recent 8 messages
    + 100  # Instructions
)
print(f"   Estimated: ~{estimated_tokens} tokens per request")
if estimated_tokens < 1000:
    print("   [OK] Context is small - should be fast!")
else:
    print(f"   [WARN] Context is {estimated_tokens} tokens - may be slow")
# 6. Final verdict: interpret the inference timing against the same
#    thresholds used in section 2, then list concrete remediation steps.
print("\n" + "=" * 70)
print("DIAGNOSTIC SUMMARY")
print("=" * 70)
print()
print("If inference test was < 5s: [OK] System is healthy")
print("If inference test was 5-15s: [WARN] Borderline - may timeout after 20+ turns")
print("If inference test was > 15s: [FAIL] Will definitely timeout - need faster hardware")
print()
print("Next steps:")
print("  1. If Ollama test was slow, try a smaller model:")
print("     ollama pull qwen2.5:3b")
print("     (Then update OLLAMA_MODEL_NAME in settings.py)")
print()
print("  2. Restart Gradio to clear old sessions:")
print("     ./stop_gradio.sh && ./start_gradio.sh")
print()
print("  3. Run demo again and watch for 'Pruned' messages in logs")
print()
print("=" * 70)
|