Andrew McCracken and Claude committed
Commit 3f2ee19 · 1 Parent(s): 2a55dc3

Increase threads to 8 for faster inference


- Use all 8 vCPUs for maximum inference speed
- Should reduce response time from ~15s to ~10-12s

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1)
  1. llm_handler.py +1 -1
llm_handler.py CHANGED
@@ -53,7 +53,7 @@ class CybersecurityLLM:
     model_path=model_path,
     n_ctx=4096,        # Context window
     n_batch=512,       # Batch size for prompt processing
-    n_threads=6,       # Use 6 of 8 vCPUs (leave 2 for system/API)
+    n_threads=8,       # Use all 8 vCPUs for maximum inference speed
     n_gpu_layers=0,    # CPU only
     seed=-1,           # Random seed
     f16_kv=True,       # Use f16 for key/value cache (saves memory)
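
For context, here is a minimal sketch of how this configuration is typically passed to llama-cpp-python's `Llama` constructor. Only the parameters visible in the diff come from the source; the `llama_cpp` import, the wrapper structure, and the `answer` helper are assumptions about how llm_handler.py might be organized:

```python
# Minimal sketch, assuming llm_handler.py wraps llama-cpp-python's Llama class.
# Constructor arguments are taken from the diff; everything else is illustrative.
from llama_cpp import Llama


class CybersecurityLLM:
    def __init__(self, model_path: str):
        self.llm = Llama(
            model_path=model_path,
            n_ctx=4096,        # Context window
            n_batch=512,       # Batch size for prompt processing
            n_threads=8,       # Use all 8 vCPUs for maximum inference speed
            n_gpu_layers=0,    # CPU only
            seed=-1,           # Random seed
            f16_kv=True,       # Use f16 for key/value cache (saves memory)
        )

    def answer(self, prompt: str) -> str:
        # Hypothetical helper; the real method names in llm_handler.py are
        # not shown in the diff.
        out = self.llm(prompt, max_tokens=512)
        return out["choices"][0]["text"]
```

One trade-off worth noting: the replaced comment reserved 2 of the 8 vCPUs for the system and API layer, so pinning `n_threads` to all 8 trades that headroom for raw token throughput, which is where the estimated drop from ~15s to ~10-12s per response comes from.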