Andrew McCracken and Claude committed
Commit 457c9e1 · 1 Parent(s): 8cfe5b7
Revert to simpler configuration - optimizations caused slowdown
- Reverted n_ctx back to 4096 (2048 was too small)
- Reverted streaming to simple token-by-token (buffering caused issues)
- Reduced threads to 6 (8 may have been overloading)
- Kept max_tokens at 256 (only change that should help)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
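
For illustration, here is how a client might consume the token-by-token SSE stream this revert restores. This is a minimal sketch, not code from the repo: the /chat/stream path, port, and request fields are assumptions; only the data: {...} events with type 'start' and 'token' come from the main.py diff below.

import json
import requests  # assumed client dependency, not part of this repo

def stream_chat(message: str, base_url: str = "http://localhost:8000"):
    # POST the chat request and read the SSE body line by line as it arrives
    with requests.post(
        f"{base_url}/chat/stream",  # assumed route for chat_stream
        json={"message": message, "max_tokens": 256},
        stream=True,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if not line or not line.startswith("data: "):
                continue  # skip blank SSE separators
            event = json.loads(line[len("data: "):])
            if event["type"] == "token":
                # print each token as soon as the server flushes it
                print(event["content"], end="", flush=True)

if __name__ == "__main__":
    stream_chat("What is a buffer overflow?")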
- llm_handler.py +2 -2
- main.py +3 -14
llm_handler.py
CHANGED
@@ -51,9 +51,9 @@ class CybersecurityLLM:
         logger.info("Initializing model...")
         self.llm = Llama(
             model_path=model_path,
-            n_ctx=2048,
+            n_ctx=4096, # Context window
             n_batch=512, # Batch size for prompt processing
-            n_threads=8,
+            n_threads=6, # Use 6 of 8 vCPUs (leave 2 for system)
             n_gpu_layers=0, # CPU only
             seed=-1, # Random seed
             f16_kv=True, # Use f16 for key/value cache (saves memory)
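
Since the commit message attributes the slowdown to these settings, a rough way to check the revert is to time a short completion and compute tokens per second. A sketch, assuming llama-cpp-python's completion API; the model path and prompt are placeholders, not the repo's values.

import time
from llama_cpp import Llama

llm = Llama(
    model_path="models/model.gguf",  # placeholder path, not the repo's actual model
    n_ctx=4096,
    n_batch=512,
    n_threads=6,
    n_gpu_layers=0,
)

start = time.perf_counter()
out = llm("Explain SQL injection in one sentence.", max_tokens=256)
elapsed = time.perf_counter() - start

# llama-cpp-python returns an OpenAI-style completion dict with a usage block
n_tokens = out["usage"]["completion_tokens"]
print(f"{n_tokens} tokens in {elapsed:.1f}s ({n_tokens / elapsed:.1f} tok/s)")

Running the same prompt with n_threads=6 and n_threads=8 would make the thread-count effect measurable rather than a matter of feel.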
main.py
CHANGED
@@ -328,29 +328,18 @@ async def chat_stream(request: ChatRequest):
     async def generate():
         try:
             full_response = ""
-            buffer = ""
-            buffer_size = 3 # Send every 3 tokens for better perceived speed

             # Send initial metadata
             yield f"data: {json.dumps({'type': 'start', 'session_id': session_id, 'model': MODEL_REPO, 'interaction_count': count})}\n\n"

-            # Stream tokens
+            # Stream tokens
             for token in llm_instance.generate_stream(
                 request.message,
                 max_tokens=request.max_tokens
             ):
                 full_response += token
-                buffer += token
-
-                # Send buffer when it reaches buffer_size or contains whitespace
-                if len(buffer) >= buffer_size or ' ' in token or '\n' in token:
-                    yield f"data: {json.dumps({'type': 'token', 'content': buffer})}\n\n"
-                    buffer = ""
-                    await asyncio.sleep(0)
-
-            # Send any remaining buffered tokens
-            if buffer:
-                yield f"data: {json.dumps({'type': 'token', 'content': buffer})}\n\n"
+                yield f"data: {json.dumps({'type': 'token', 'content': token})}\n\n"
+                await asyncio.sleep(0)

             # Log interaction
             log_interaction(session_id, request.message, len(full_response))
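
generate_stream itself is not part of this diff. A plausible sketch of the llm_handler.py side, assuming it wraps llama-cpp-python's streaming completion call; the method body and prompt handling are assumptions, only the name and max_tokens parameter come from the diff above.

def generate_stream(self, prompt: str, max_tokens: int = 256):
    # With stream=True, llama-cpp-python returns an iterator of
    # OpenAI-style chunks instead of a single completion dict
    for chunk in self.llm(
        prompt,
        max_tokens=max_tokens,
        stream=True,
    ):
        # each chunk carries one decoded piece of text
        yield chunk["choices"][0]["text"]

On the main.py side, the await asyncio.sleep(0) after each yield hands control back to the event loop so every SSE event is flushed individually instead of queueing behind the CPU-bound generation.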