Andrew McCracken Claude committed on
Commit
b7fb901
·
1 Parent(s): 6e83384

Optimize for 8vCPU/32GB instance

Browse files

- Use 6 threads (leave 2 for system/API)
- Restore n_ctx=4096 and n_batch=512 for better quality
- Restore max_tokens=512 for complete responses
- Enable verbose logging for debugging
- f16_kv=True for memory efficiency with sufficient RAM

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show
  1. llm_handler.py +5 -5
  2. main.py +1 -1
llm_handler.py CHANGED
@@ -51,17 +51,17 @@ class CybersecurityLLM:
51
  logger.info("Initializing model...")
52
  self.llm = Llama(
53
  model_path=model_path,
54
- n_ctx=2048, # Reduced context window for faster inference
55
- n_batch=256, # Smaller batch size for lower memory usage
56
- n_threads=4, # Reduced threads for HF Spaces CPU limits
57
  n_gpu_layers=0, # CPU only
58
  seed=-1, # Random seed
59
- f16_kv=False, # Use f32 for better CPU performance
60
  logits_all=False, # Only compute logits for last token
61
  vocab_only=False, # Load full model
62
  use_mmap=True, # Memory-map model for efficiency
63
  use_mlock=False, # Don't lock model in RAM
64
- verbose=False
65
  )
66
 
67
  # Store model info
 
51
  logger.info("Initializing model...")
52
  self.llm = Llama(
53
  model_path=model_path,
54
+ n_ctx=4096, # Context window
55
+ n_batch=512, # Batch size for prompt processing
56
+ n_threads=6, # Use 6 of 8 vCPUs (leave 2 for system/API)
57
  n_gpu_layers=0, # CPU only
58
  seed=-1, # Random seed
59
+ f16_kv=True, # Use f16 for key/value cache (saves memory)
60
  logits_all=False, # Only compute logits for last token
61
  vocab_only=False, # Load full model
62
  use_mmap=True, # Memory-map model for efficiency
63
  use_mlock=False, # Don't lock model in RAM
64
+ verbose=True # Enable verbose for debugging
65
  )
66
 
67
  # Store model info
main.py CHANGED
@@ -158,7 +158,7 @@ app.add_middleware(
158
  class ChatRequest(BaseModel):
159
  message: str = Field(..., description="User's security question")
160
  session_id: Optional[str] = Field(None, description="Session ID for conversation continuity")
161
- max_tokens: Optional[int] = Field(256, description="Maximum response length")
162
  temperature: Optional[float] = Field(0.7, description="Response creativity (0-1)")
163
  use_rag: Optional[bool] = Field(True, description="Use RAG for enhanced accuracy")
164
  use_cache: Optional[bool] = Field(True, description="Use cached responses if available")
 
158
  class ChatRequest(BaseModel):
159
  message: str = Field(..., description="User's security question")
160
  session_id: Optional[str] = Field(None, description="Session ID for conversation continuity")
161
+ max_tokens: Optional[int] = Field(512, description="Maximum response length")
162
  temperature: Optional[float] = Field(0.7, description="Response creativity (0-1)")
163
  use_rag: Optional[bool] = Field(True, description="Use RAG for enhanced accuracy")
164
  use_cache: Optional[bool] = Field(True, description="Use cached responses if available")