Andrew McCracken, Claude committed
Commit 6e83384 · Parent(s): 1b98923
Optimize model parameters for faster CPU inference

- Reduced n_ctx from 4096 to 2048 for faster inference
- Reduced n_batch from 512 to 256 for lower memory usage
- Reduced n_threads from 8 to 4 for HF Spaces CPU limits
- Changed f16_kv to False for better CPU performance
- Reduced default max_tokens from 512 to 256 for faster responses

These changes need to be rebuilt into the base Docker image.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
- llm_handler.py +4 -4
- main.py +1 -1
llm_handler.py CHANGED

```diff
@@ -51,12 +51,12 @@ class CybersecurityLLM:
         logger.info("Initializing model...")
         self.llm = Llama(
             model_path=model_path,
-            n_ctx=4096,
-            n_batch=512,
-            n_threads=8,
+            n_ctx=2048,  # Reduced context window for faster inference
+            n_batch=256,  # Smaller batch size for lower memory usage
+            n_threads=4,  # Reduced threads for HF Spaces CPU limits
             n_gpu_layers=0,  # CPU only
             seed=-1,  # Random seed
-            f16_kv=True,
+            f16_kv=False,  # Use f32 for better CPU performance
             logits_all=False,  # Only compute logits for last token
             vocab_only=False,  # Load full model
             use_mmap=True,  # Memory-map model for efficiency
```
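Taken together, these settings trade context length and parallelism for lower CPU and memory pressure. A minimal sketch of the resulting initialization and a completion call, assuming the llama-cpp-python `Llama` API the handler already uses (the model path and prompt here are placeholders, not from the repo):

```python
from llama_cpp import Llama

# Placeholder path; llm_handler.py resolves the real model_path elsewhere.
llm = Llama(
    model_path="models/model.gguf",
    n_ctx=2048,      # prompt + completion must now fit in 2048 tokens
    n_batch=256,     # fewer tokens evaluated per batch -> lower peak memory
    n_threads=4,     # sized for the HF Spaces CPU allocation
    n_gpu_layers=0,  # CPU only
    f16_kv=False,    # f32 KV cache avoids fp16 conversion overhead on CPU
    use_mmap=True,   # memory-map the model file
)

# With the reduced defaults, a completion call looks like:
out = llm("Q: What is SQL injection?\nA:", max_tokens=256, temperature=0.7)
print(out["choices"][0]["text"])
```

Note that `n_ctx=2048` bounds prompt plus completion together, so long RAG-augmented prompts leave correspondingly less room for the response.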
main.py CHANGED

```diff
@@ -158,7 +158,7 @@ app.add_middleware(
 class ChatRequest(BaseModel):
     message: str = Field(..., description="User's security question")
     session_id: Optional[str] = Field(None, description="Session ID for conversation continuity")
-    max_tokens: Optional[int] = Field(512, description="Maximum response length")
+    max_tokens: Optional[int] = Field(256, description="Maximum response length")
     temperature: Optional[float] = Field(0.7, description="Response creativity (0-1)")
     use_rag: Optional[bool] = Field(True, description="Use RAG for enhanced accuracy")
     use_cache: Optional[bool] = Field(True, description="Use cached responses if available")
```
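The only behavioral change here is the default: clients that omit `max_tokens` now get 256-token responses, while explicit values are untouched. A quick standalone sketch of how the Pydantic default applies (the request messages are made up for illustration):

```python
from typing import Optional
from pydantic import BaseModel, Field

class ChatRequest(BaseModel):
    message: str = Field(..., description="User's security question")
    session_id: Optional[str] = Field(None, description="Session ID for conversation continuity")
    max_tokens: Optional[int] = Field(256, description="Maximum response length")
    temperature: Optional[float] = Field(0.7, description="Response creativity (0-1)")
    use_rag: Optional[bool] = Field(True, description="Use RAG for enhanced accuracy")
    use_cache: Optional[bool] = Field(True, description="Use cached responses if available")

req = ChatRequest(message="How should I store API keys?")
print(req.max_tokens)  # 256 -- the new default

explicit = ChatRequest(message="Explain XSS in depth", max_tokens=512)
print(explicit.max_tokens)  # 512 -- explicit values still win
```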