File size: 4,730 Bytes
42413d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
"""
Quick diagnostic to check why your demo is timing out.
Run this to see if it's context size, Ollama issues, or something else.
"""

import subprocess
import time
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

print("πŸ” D&D RAG System - Performance Diagnostic\n")
print("="*70)

# 1. Check Ollama is running
print("\n1️⃣  Checking Ollama...")
try:
    result = subprocess.run(['ollama', 'ps'], capture_output=True, text=True, timeout=5)
    if result.returncode == 0:
        print("   βœ… Ollama is running")
        if result.stdout.strip():
            print(f"   πŸ“Š Loaded models:\n{result.stdout}")
        else:
            print("   ⚠️  No models loaded in memory!")
            print("      Model will be loaded on first request (adds 5-10s delay)")
    else:
        print(f"   ❌ Ollama error: {result.stderr}")
except FileNotFoundError:
    print("   ❌ Ollama not found! Install from https://ollama.ai")
    sys.exit(1)
except subprocess.TimeoutExpired:
    print("   ❌ Ollama not responding")
    sys.exit(1)

# 2. Test inference speed
print("\n2️⃣  Testing inference speed...")
model = "hf.co/Chun121/Qwen3-4B-RPG-Roleplay-V2:Q4_K_M"
test_prompt = "You are a DM. The player attacks. Respond in 1 sentence."

print(f"   Testing with: {model}")
print(f"   Prompt: '{test_prompt}'")

start = time.time()
try:
    result = subprocess.run(
        ['ollama', 'run', model, test_prompt],
        capture_output=True,
        text=True,
        timeout=30
    )
    elapsed = time.time() - start
    
    if result.returncode == 0:
        print(f"   βœ… Response in {elapsed:.1f}s")
        if elapsed < 5:
            print("      πŸš€ Fast! System is healthy.")
        elif elapsed < 15:
            print("      ⚠️  Acceptable but could be faster.")
        else:
            print("      πŸ”₯ Slow! This will cause timeouts in long sessions.")
        print(f"   πŸ“ Response: {result.stdout[:100]}...")
    else:
        print(f"   ❌ Error: {result.stderr}")
except subprocess.TimeoutExpired:
    print(f"   ❌ TIMEOUT after 30s! Your system is too slow.")
    print("      Possible causes:")
    print("      - CPU too slow")
    print("      - RAM/swap thrashing")
    print("      - Model too large for available RAM")

# 3. Check context window settings
print("\n3️⃣  Checking context window settings...")
from dnd_rag_system.config import settings

print(f"   MAX_MESSAGE_HISTORY: {settings.MAX_MESSAGE_HISTORY}")
print(f"   RECENT_MESSAGES_FOR_PROMPT: {settings.RECENT_MESSAGES_FOR_PROMPT}")
print(f"   OLLAMA_TIMEOUT: {settings.OLLAMA_TIMEOUT}s")

if settings.MAX_MESSAGE_HISTORY <= 20:
    print("   βœ… Pruning is configured correctly")
else:
    print(f"   ⚠️  MAX_MESSAGE_HISTORY is {settings.MAX_MESSAGE_HISTORY} (high!)")
    print("      Consider lowering to 15-20")

# 4. Test GameMaster initialization
print("\n4️⃣  Testing GameMaster initialization...")
try:
    from dnd_rag_system.core.chroma_manager import ChromaDBManager
    from dnd_rag_system.systems.gm_dialogue_unified import GameMaster
    
    db = ChromaDBManager()
    gm = GameMaster(db)
    
    print("   βœ… GameMaster initialized")
    print(f"   Message history: {len(gm.message_history)}")
    print(f"   Has _prune_message_history: {hasattr(gm, '_prune_message_history')}")
    print(f"   Has conversation_summary: {hasattr(gm, 'conversation_summary')}")
    
except Exception as e:
    print(f"   ❌ Error: {e}")

# 5. Estimate typical context size
print("\n5️⃣  Estimating context size...")
estimated_tokens = (
    100 +  # System prompt
    50 +   # Location/time
    75 +   # Party status
    200 +  # Recent 8 messages
    100    # Instructions
)
print(f"   Estimated: ~{estimated_tokens} tokens per request")
if estimated_tokens < 1000:
    print("   βœ… Context is small - should be fast!")
else:
    print(f"   ⚠️  Context is {estimated_tokens} tokens - may be slow")

# 6. Final verdict
print("\n" + "="*70)
print("πŸ“Š DIAGNOSTIC SUMMARY")
print("="*70)
print()
print("If inference test was < 5s: βœ… System is healthy")
print("If inference test was 5-15s: ⚠️  Borderline - may timeout after 20+ turns")
print("If inference test was > 15s: πŸ”₯ Will definitely timeout - need faster hardware")
print()
print("Next steps:")
print("  1. If Ollama test was slow, try a smaller model:")
print("     ollama pull qwen2.5:3b")
print("     (Then update OLLAMA_MODEL_NAME in settings.py)")
print()
print("  2. Restart Gradio to clear old sessions:")
print("     ./stop_gradio.sh && ./start_gradio.sh")
print()
print("  3. Run demo again and watch for 'Pruned' messages in logs")
print()

print("="*70)