Andrew McCracken, Claude committed
Commit 6e83384 · 1 Parent(s): 1b98923

Optimize model parameters for faster CPU inference


- Reduced n_ctx from 4096 to 2048 for faster inference
- Reduced n_batch from 512 to 256 for lower memory usage
- Reduced n_threads from 8 to 4 to stay within HF Spaces CPU limits
- Changed f16_kv to False (f32 key/value cache) for better CPU performance
- Reduced default max_tokens from 512 to 256 for faster responses

The base Docker image needs to be rebuilt for these changes to take effect.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2)
  1. llm_handler.py +4 -4
  2. main.py +1 -1
llm_handler.py CHANGED
@@ -51,12 +51,12 @@ class CybersecurityLLM:
         logger.info("Initializing model...")
         self.llm = Llama(
             model_path=model_path,
-            n_ctx=4096,  # Context window
-            n_batch=512,  # Batch size for prompt processing
-            n_threads=8,  # Adjust based on CPU cores
+            n_ctx=2048,  # Reduced context window for faster inference
+            n_batch=256,  # Smaller batch size for lower memory usage
+            n_threads=4,  # Reduced threads for HF Spaces CPU limits
             n_gpu_layers=0,  # CPU only
             seed=-1,  # Random seed
-            f16_kv=True,  # Use f16 for key/value cache
+            f16_kv=False,  # Use f32 for better CPU performance
             logits_all=False,  # Only compute logits for last token
             vocab_only=False,  # Load full model
             use_mmap=True,  # Memory-map model for efficiency
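
For reference, a minimal standalone sketch of loading a model with the reduced settings via llama-cpp-python and timing a completion; the model path and prompt are placeholders, not taken from this repo:

import time
from llama_cpp import Llama

llm = Llama(
    model_path="models/model.gguf",  # placeholder path, not the Space's actual model file
    n_ctx=2048,       # reduced context window
    n_batch=256,      # smaller prompt-processing batch
    n_threads=4,      # matches the HF Spaces CPU allocation
    n_gpu_layers=0,   # CPU only
    f16_kv=False,     # f32 key/value cache
)

start = time.time()
result = llm("What is SQL injection?", max_tokens=256, temperature=0.7)
print(f"generated in {time.time() - start:.1f}s")
print(result["choices"][0]["text"])

The key/value cache scales with n_ctx, so halving the context window also roughly halves cache memory, which is typically the larger win on a CPU-only Space.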
main.py CHANGED
@@ -158,7 +158,7 @@ app.add_middleware(
 class ChatRequest(BaseModel):
     message: str = Field(..., description="User's security question")
     session_id: Optional[str] = Field(None, description="Session ID for conversation continuity")
-    max_tokens: Optional[int] = Field(512, description="Maximum response length")
+    max_tokens: Optional[int] = Field(256, description="Maximum response length")
     temperature: Optional[float] = Field(0.7, description="Response creativity (0-1)")
     use_rag: Optional[bool] = Field(True, description="Use RAG for enhanced accuracy")
     use_cache: Optional[bool] = Field(True, description="Use cached responses if available")
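
A quick sketch of what the new default means for clients, using a trimmed-down copy of the model above (only the relevant fields):

from typing import Optional
from pydantic import BaseModel, Field

class ChatRequest(BaseModel):
    message: str = Field(..., description="User's security question")
    max_tokens: Optional[int] = Field(256, description="Maximum response length")
    temperature: Optional[float] = Field(0.7, description="Response creativity (0-1)")

req = ChatRequest(message="Explain XSS")  # max_tokens omitted by the client
assert req.max_tokens == 256              # requests now default to the shorter limit

Clients that explicitly pass max_tokens are unaffected; only the default for omitted fields changes.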