Andrew McCracken and Claude committed
Commit 457c9e1 · 1 Parent(s): 8cfe5b7

Revert to simpler configuration - optimizations caused slowdown


- Reverted n_ctx back to 4096 (2048 was too small)
- Reverted streaming to simple token-by-token output (buffering caused issues)
- Reduced threads to 6 (8 may have been oversubscribing the CPU)
- Kept max_tokens at 256 (the only change that should help)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
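The thread-count claim in the message is easy to check empirically. Below is a minimal timing sketch, assuming llama-cpp-python is installed; the model path and prompt are placeholders, not from this repo. It times one non-streaming completion at each thread count so the two settings can be compared directly.

```python
# Hypothetical timing harness to compare n_threads settings.
# MODEL_PATH and the prompt are placeholders, not from the repo.
import time
from llama_cpp import Llama

MODEL_PATH = "models/model.gguf"  # placeholder

for n_threads in (6, 8):
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=4096,
        n_batch=512,
        n_threads=n_threads,
        n_gpu_layers=0,  # CPU only, matching the config below
        verbose=False,
    )
    start = time.perf_counter()
    out = llm("Summarize the CIA triad in one sentence.", max_tokens=256)
    elapsed = time.perf_counter() - start
    n_tokens = out["usage"]["completion_tokens"]
    print(f"n_threads={n_threads}: {n_tokens} tokens in {elapsed:.1f}s "
          f"({n_tokens / elapsed:.1f} tok/s)")
```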

Files changed (2)
  1. llm_handler.py +2 -2
  2. main.py +3 -14
llm_handler.py CHANGED
@@ -51,9 +51,9 @@ class CybersecurityLLM:
         logger.info("Initializing model...")
         self.llm = Llama(
             model_path=model_path,
-            n_ctx=2048,        # Reduced context window for faster prompt processing
+            n_ctx=4096,        # Context window
             n_batch=512,       # Batch size for prompt processing
-            n_threads=8,       # Use all 8 vCPUs for maximum inference speed
+            n_threads=6,       # Use 6 of 8 vCPUs (leave 2 for system)
             n_gpu_layers=0,    # CPU only
             seed=-1,           # Random seed
             f16_kv=True,       # Use f16 for key/value cache (saves memory)
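The main.py diff below calls llm_instance.generate_stream, which is not shown in this commit. A plausible minimal sketch, assuming it is a thin wrapper over llama-cpp-python's streaming completion API (the method body here is an assumption, not the repo's actual code):

```python
# Hypothetical sketch of the generate_stream method used by main.py.
# Assumes self.llm is the Llama instance configured above.
from typing import Iterator

def generate_stream(self, prompt: str, max_tokens: int = 256) -> Iterator[str]:
    """Yield completion text piece by piece as llama.cpp produces it."""
    for chunk in self.llm.create_completion(
        prompt,
        max_tokens=max_tokens,
        stream=True,  # return an iterator of partial completions
    ):
        # Each streamed chunk carries a small text fragment.
        yield chunk["choices"][0]["text"]
```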
main.py CHANGED
@@ -328,29 +328,18 @@ async def chat_stream(request: ChatRequest):
     async def generate():
         try:
             full_response = ""
-            buffer = ""
-            buffer_size = 3  # Send every 3 tokens for better perceived speed

             # Send initial metadata
             yield f"data: {json.dumps({'type': 'start', 'session_id': session_id, 'model': MODEL_REPO, 'interaction_count': count})}\n\n"

-            # Stream tokens with buffering
+            # Stream tokens
             for token in llm_instance.generate_stream(
                 request.message,
                 max_tokens=request.max_tokens
             ):
                 full_response += token
-                buffer += token
-
-                # Send buffer when it reaches buffer_size or contains whitespace
-                if len(buffer) >= buffer_size or ' ' in token or '\n' in token:
-                    yield f"data: {json.dumps({'type': 'token', 'content': buffer})}\n\n"
-                    buffer = ""
-                    await asyncio.sleep(0)
-
-            # Send any remaining buffered tokens
-            if buffer:
-                yield f"data: {json.dumps({'type': 'token', 'content': buffer})}\n\n"
+                yield f"data: {json.dumps({'type': 'token', 'content': token})}\n\n"
+                await asyncio.sleep(0)

             # Log interaction
             log_interaction(session_id, request.message, len(full_response))
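For reference, a small client sketch that consumes the token-by-token SSE stream the new code emits. The endpoint URL and request payload shape are assumptions inferred from the handler above, not from this commit.

```python
# Hypothetical client for the SSE streaming endpoint in main.py.
# URL and payload are assumptions, not from this commit.
import json
import requests

resp = requests.post(
    "http://localhost:8000/chat/stream",  # assumed route and port
    json={"message": "What is a buffer overflow?", "max_tokens": 256},
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data: "):
        continue  # skip blank SSE separators
    event = json.loads(line[len("data: "):])
    if event["type"] == "token":
        # Print each token as it arrives, matching the server's payload shape.
        print(event["content"], end="", flush=True)
```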