Andrew McCracken and Claude committed
Commit 8cfe5b7 · Parent: 3f2ee19
Optimize for faster inference
Performance optimizations:
1. Reduced max_tokens from 512 to 256 (faster responses)
2. Reduced n_ctx from 4096 to 2048 (faster prompt processing)
3. Added token buffering in streaming (better perceived speed)
   - Buffers 3 tokens or until whitespace
   - Reduces network overhead
Expected speedup: 15s → 8-10s per response
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
- llm_handler.py +1 -1
- main.py +15 -4
llm_handler.py CHANGED

@@ -51,7 +51,7 @@ class CybersecurityLLM:
         logger.info("Initializing model...")
         self.llm = Llama(
             model_path=model_path,
-            n_ctx=4096,
+            n_ctx=2048,       # Reduced context window for faster prompt processing
             n_batch=512,      # Batch size for prompt processing
             n_threads=8,      # Use all 8 vCPUs for maximum inference speed
             n_gpu_layers=0,   # CPU only
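Note: with the context window halved to 2048 tokens, the assembled prompt plus the 256-token response budget must now fit in n_ctx. A minimal pre-flight check, sketched under the assumption that `llm` is the Llama instance built above and `prompt` is the fully assembled prompt text (this helper is illustrative, not part of the repo):

# Sketch only: guard against overflowing the reduced 2048-token window.
# `llm` is assumed to be the Llama instance created in CybersecurityLLM.
MAX_CTX = 2048        # matches n_ctx after this commit
MAX_RESPONSE = 256    # matches the new max_tokens default in main.py

def fits_in_context(llm, prompt: str) -> bool:
    prompt_tokens = llm.tokenize(prompt.encode("utf-8"))
    return len(prompt_tokens) + MAX_RESPONSE <= MAX_CTX

If the check fails, older conversation turns or RAG context would need to be trimmed before generation.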
main.py CHANGED

@@ -158,7 +158,7 @@ app.add_middleware(
 class ChatRequest(BaseModel):
     message: str = Field(..., description="User's security question")
     session_id: Optional[str] = Field(None, description="Session ID for conversation continuity")
-    max_tokens: Optional[int] = Field(512, description="Maximum response length")
+    max_tokens: Optional[int] = Field(256, description="Maximum response length")
     temperature: Optional[float] = Field(0.7, description="Response creativity (0-1)")
     use_rag: Optional[bool] = Field(True, description="Use RAG for enhanced accuracy")
     use_cache: Optional[bool] = Field(True, description="Use cached responses if available")

@@ -328,18 +328,29 @@ async def chat_stream(request: ChatRequest):
     async def generate():
         try:
             full_response = ""
+            buffer = ""
+            buffer_size = 3  # Send every 3 tokens for better perceived speed

             # Send initial metadata
             yield f"data: {json.dumps({'type': 'start', 'session_id': session_id, 'model': MODEL_REPO, 'interaction_count': count})}\n\n"

-            # Stream tokens
+            # Stream tokens with buffering
             for token in llm_instance.generate_stream(
                 request.message,
                 max_tokens=request.max_tokens
             ):
                 full_response += token
-                yield f"data: {json.dumps({'type': 'token', 'content': token})}\n\n"
-                await asyncio.sleep(0)
+                buffer += token
+
+                # Send buffer when it reaches buffer_size or contains whitespace
+                if len(buffer) >= buffer_size or ' ' in token or '\n' in token:
+                    yield f"data: {json.dumps({'type': 'token', 'content': buffer})}\n\n"
+                    buffer = ""
+                    await asyncio.sleep(0)
+
+            # Send any remaining buffered tokens
+            if buffer:
+                yield f"data: {json.dumps({'type': 'token', 'content': buffer})}\n\n"

             # Log interaction
             log_interaction(session_id, request.message, len(full_response))
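For reference, a minimal client-side sketch of consuming the buffered stream. The route path, port, and POST method are assumptions (the route decorator is not shown in this diff); the 'start' and 'token' event shapes match the payloads yielded above.

# Sketch only: read the buffered SSE stream from chat_stream().
# The URL below is an assumption; adjust to the actual route.
import json
import requests

payload = {"message": "What is SQL injection?", "max_tokens": 256}
with requests.post("http://localhost:8000/chat/stream", json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        event = json.loads(line[len("data: "):])
        if event["type"] == "token":
            # Chunks arrive roughly 3 tokens at a time, or sooner on whitespace.
            print(event["content"], end="", flush=True)

Because whitespace in a token flushes the buffer immediately, word boundaries still appear promptly; the 3-token batching mainly reduces the number of SSE messages sent over the network.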