Andrew McCracken
Claude
committed on
Commit
·
b7fb901
1
Parent(s):
6e83384
Optimize for 8vCPU/32GB instance
Browse files
- Use 6 threads (leave 2 for system/API)
- Restore n_ctx=4096 and n_batch=512 for better quality
- Restore max_tokens=512 for complete responses
- Enable verbose logging for debugging
- f16_kv=True for memory efficiency with sufficient RAM
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
- llm_handler.py +5 -5
- main.py +1 -1
llm_handler.py
CHANGED
|
@@ -51,17 +51,17 @@ class CybersecurityLLM:
|
|
| 51 |
logger.info("Initializing model...")
|
| 52 |
self.llm = Llama(
|
| 53 |
model_path=model_path,
|
| 54 |
-
n_ctx=
|
| 55 |
-
n_batch=
|
| 56 |
-
n_threads=
|
| 57 |
n_gpu_layers=0, # CPU only
|
| 58 |
seed=-1, # Random seed
|
| 59 |
-
f16_kv=
|
| 60 |
logits_all=False, # Only compute logits for last token
|
| 61 |
vocab_only=False, # Load full model
|
| 62 |
use_mmap=True, # Memory-map model for efficiency
|
| 63 |
use_mlock=False, # Don't lock model in RAM
|
| 64 |
-
verbose=
|
| 65 |
)
|
| 66 |
|
| 67 |
# Store model info
|
|
|
|
| 51 |
logger.info("Initializing model...")
|
| 52 |
self.llm = Llama(
|
| 53 |
model_path=model_path,
|
| 54 |
+
n_ctx=4096, # Context window
|
| 55 |
+
n_batch=512, # Batch size for prompt processing
|
| 56 |
+
n_threads=6, # Use 6 of 8 vCPUs (leave 2 for system/API)
|
| 57 |
n_gpu_layers=0, # CPU only
|
| 58 |
seed=-1, # Random seed
|
| 59 |
+
f16_kv=True, # Use f16 for key/value cache (saves memory)
|
| 60 |
logits_all=False, # Only compute logits for last token
|
| 61 |
vocab_only=False, # Load full model
|
| 62 |
use_mmap=True, # Memory-map model for efficiency
|
| 63 |
use_mlock=False, # Don't lock model in RAM
|
| 64 |
+
verbose=True # Enable verbose for debugging
|
| 65 |
)
|
| 66 |
|
| 67 |
# Store model info
|
main.py
CHANGED
|
@@ -158,7 +158,7 @@ app.add_middleware(
|
|
| 158 |
class ChatRequest(BaseModel):
|
| 159 |
message: str = Field(..., description="User's security question")
|
| 160 |
session_id: Optional[str] = Field(None, description="Session ID for conversation continuity")
|
| 161 |
-
max_tokens: Optional[int] = Field(
|
| 162 |
temperature: Optional[float] = Field(0.7, description="Response creativity (0-1)")
|
| 163 |
use_rag: Optional[bool] = Field(True, description="Use RAG for enhanced accuracy")
|
| 164 |
use_cache: Optional[bool] = Field(True, description="Use cached responses if available")
|
|
|
|
| 158 |
class ChatRequest(BaseModel):
|
| 159 |
message: str = Field(..., description="User's security question")
|
| 160 |
session_id: Optional[str] = Field(None, description="Session ID for conversation continuity")
|
| 161 |
+
max_tokens: Optional[int] = Field(512, description="Maximum response length")
|
| 162 |
temperature: Optional[float] = Field(0.7, description="Response creativity (0-1)")
|
| 163 |
use_rag: Optional[bool] = Field(True, description="Use RAG for enhanced accuracy")
|
| 164 |
use_cache: Optional[bool] = Field(True, description="Use cached responses if available")
|