Spaces:

tech-daskalos
/

CyberSecChatbot

Paused

Andrew McCracken Claude commited on Oct 14, 2025

Commit

efd4459

1 Parent(s): bfa102d

Add concurrent request handling with model pool

Implemented ModelPool for true concurrent processing:
- Created ModelPool class with thread-safe queue
- Initializes 10 model instances (configurable via MODEL_POOL_SIZE)
- Each instance can handle one request simultaneously
- Automatic model checkout/return from pool
- Added pool statistics to /health endpoint

Configuration:
- MODEL_POOL_SIZE=10 (supports 10 concurrent users)
- 60s timeout if all instances busy
- Each model instance ~2.4GB VRAM
- Total VRAM: ~24GB for 10 instances (fits in 48GB GPU)

Sessions are handled via session_id parameter (already present)
Pool automatically balances load across instances

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show

Dockerfile.gpu +5 -2
main.py +111 -18

Dockerfile.gpu CHANGED Viewed

@@ -1,6 +1,6 @@
 # Use pre-built GPU image from Docker Hub
-# Build this image locally with: docker buildx build --platform linux/amd64 -f Dockerfile.base.gpu -t techdaskalos/cybersecchatbot:gpu . --push
-FROM techdaskalos/cybersecchatbot:gpu
 # Environment variables (already set in base image, but can override)
 ENV PYTHONUNBUFFERED=1
@@ -12,6 +12,9 @@ ENV CACHE_ENABLED=true
 # GPU configuration - offload all layers to GPU
 ENV N_GPU_LAYERS=35
 # Set Hugging Face cache to /data for persistence and write permissions
 ENV HF_HOME=/data/huggingface

 # Use pre-built GPU image from Docker Hub
+# Build this image locally with: docker buildx build --platform linux/amd64 -f Dockerfile.base.gpu -t techdaskalos/cybersecchatbot:latest-gpu . --push
+FROM techdaskalos/cybersecchatbot:latest-gpu
 # Environment variables (already set in base image, but can override)
 ENV PYTHONUNBUFFERED=1
 # GPU configuration - offload all layers to GPU
 ENV N_GPU_LAYERS=35
+# Concurrent request handling - 10 model instances for 10 concurrent users
+ENV MODEL_POOL_SIZE=10
 # Set Hugging Face cache to /data for persistence and write permissions
 ENV HF_HOME=/data/huggingface

main.py CHANGED Viewed

@@ -10,22 +10,98 @@ import uuid
 import os
 import sqlite3
 from contextlib import asynccontextmanager
 # Import our handlers
 from llm_handler import CybersecurityLLM
 from knowledge_base import RAGCybersecurityLLM
 from optimisations import PerformanceOptimizer, MemoryManager
 # Configuration from environment variables
 MODEL_REPO = os.getenv("MODEL_REPO", "daskalos-apps/phi4-cybersec-Q4_K_M")
 MODEL_FILENAME = os.getenv("MODEL_FILENAME", "phi4-mini-instruct-Q4_K_M.gguf")
 USE_RAG = os.getenv("USE_RAG", "true").lower() == "true"
 CACHE_ENABLED = os.getenv("CACHE_ENABLED", "true").lower() == "true"
 # Global instances
 llm_instance = None
 optimizer = None
 memory_manager = None
 # Database setup
 # Support multiple deployment platforms: /data (HF Spaces), /app/data (Render/Railway), or local
@@ -94,26 +170,31 @@ def log_interaction(session_id: str, message: str, response_length: int):
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Startup and shutdown events"""
-    global llm_instance, optimizer, memory_manager
     # Startup
     print(f"🚀 Loading model from Hugging Face: {MODEL_REPO}")
     # Initialize database
     init_db()
     print("✅ Database initialized")
     try:
-        if USE_RAG:
-            llm_instance = RAGCybersecurityLLM(
-                repo_id=MODEL_REPO,
-                filename=MODEL_FILENAME
-            )
-        else:
-            llm_instance = CybersecurityLLM(
-                repo_id=MODEL_REPO,
-                filename=MODEL_FILENAME
-            )
         if CACHE_ENABLED:
             optimizer = PerformanceOptimizer()
@@ -125,6 +206,7 @@ async def lifespan(app: FastAPI):
         print(f"💾 Size: {llm_instance.get_model_info()['size_mb']:.2f} MB")
         print(f"🔧 RAG: {'Enabled' if USE_RAG else 'Disabled'}")
         print(f"⚡ Cache: {'Enabled' if CACHE_ENABLED else 'Disabled'}")
     except Exception as e:
         print(f"❌ Failed to load model: {e}")
@@ -204,6 +286,7 @@ async def health_check():
         raise HTTPException(status_code=503, detail="Model not loaded")
     memory_status = memory_manager.check_memory() if memory_manager else {}
     return {
         "status": "healthy",
@@ -211,7 +294,8 @@ async def health_check():
         "version": "2.0.0",
         "memory": memory_status,
         "cache_enabled": CACHE_ENABLED,
-        "rag_enabled": USE_RAG
     }
@@ -317,23 +401,28 @@ async def chat(request: ChatRequest):
 @app.post("/chat/stream")
 async def chat_stream(request: ChatRequest):
-    """Streaming chat endpoint"""
-    if llm_instance is None:
-        raise HTTPException(status_code=503, detail="Model not loaded")
     # Track interaction
     count = increment_interaction()
     session_id = request.session_id or str(uuid.uuid4())
     async def generate():
         try:
             full_response = ""
-            # Send initial metadata
-            yield f"data: {json.dumps({'type': 'start', 'session_id': session_id, 'model': MODEL_REPO, 'interaction_count': count})}\n\n"
             # Stream tokens
-            for token in llm_instance.generate_stream(
                     request.message,
                     max_tokens=request.max_tokens
             ):
@@ -348,6 +437,10 @@ async def chat_stream(request: ChatRequest):
         except Exception as e:
             yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
     return StreamingResponse(generate(), media_type="text/event-stream")

 import os
 import sqlite3
 from contextlib import asynccontextmanager
+import queue
+import threading
 # Import our handlers
 from llm_handler import CybersecurityLLM
 from knowledge_base import RAGCybersecurityLLM
 from optimisations import PerformanceOptimizer, MemoryManager
+class ModelPool:
+    """Thread-safe pool of model instances for concurrent request handling"""
+    def __init__(self, pool_size: int, model_class, **model_kwargs):
+        """
+        Initialize a pool of model instances
+        Args:
+            pool_size: Number of model instances to create
+            model_class: The model class to instantiate (CybersecurityLLM or RAGCybersecurityLLM)
+            **model_kwargs: Arguments to pass to each model instance
+        """
+        self.pool_size = pool_size
+        self.model_class = model_class
+        self.model_kwargs = model_kwargs
+        self.pool = queue.Queue(maxsize=pool_size)
+        self.lock = threading.Lock()
+        self._initialize_pool()
+    def _initialize_pool(self):
+        """Create and add model instances to the pool"""
+        print(f"🔄 Initializing model pool with {self.pool_size} instances...")
+        for i in range(self.pool_size):
+            print(f"   Loading model instance {i + 1}/{self.pool_size}...")
+            model = self.model_class(**self.model_kwargs)
+            self.pool.put(model)
+        print(f"✅ Model pool ready with {self.pool_size} instances")
+    async def get_model(self, timeout: float = 30.0):
+        """
+        Get an available model from the pool (async)
+        Args:
+            timeout: Maximum time to wait for an available model
+        Returns:
+            Model instance
+        Raises:
+            HTTPException: If no model available within timeout
+        """
+        start_time = asyncio.get_event_loop().time()
+        while True:
+            try:
+                # Try to get a model without blocking
+                model = self.pool.get_nowait()
+                return model
+            except queue.Empty:
+                # Check timeout
+                if asyncio.get_event_loop().time() - start_time > timeout:
+                    raise HTTPException(
+                        status_code=503,
+                        detail=f"All {self.pool_size} model instances are busy. Please try again later."
+                    )
+                # Wait a bit before trying again
+                await asyncio.sleep(0.1)
+    def return_model(self, model):
+        """Return a model to the pool"""
+        self.pool.put(model)
+    def get_stats(self) -> Dict[str, Any]:
+        """Get pool statistics"""
+        return {
+            "pool_size": self.pool_size,
+            "available": self.pool.qsize(),
+            "in_use": self.pool_size - self.pool.qsize()
+        }
 # Configuration from environment variables
 MODEL_REPO = os.getenv("MODEL_REPO", "daskalos-apps/phi4-cybersec-Q4_K_M")
 MODEL_FILENAME = os.getenv("MODEL_FILENAME", "phi4-mini-instruct-Q4_K_M.gguf")
 USE_RAG = os.getenv("USE_RAG", "true").lower() == "true"
 CACHE_ENABLED = os.getenv("CACHE_ENABLED", "true").lower() == "true"
+MODEL_POOL_SIZE = int(os.getenv("MODEL_POOL_SIZE", "10"))  # Number of concurrent model instances
 # Global instances
 llm_instance = None
 optimizer = None
 memory_manager = None
+model_pool = None  # Pool of model instances for concurrent processing
 # Database setup
 # Support multiple deployment platforms: /data (HF Spaces), /app/data (Render/Railway), or local
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Startup and shutdown events"""
+    global llm_instance, optimizer, memory_manager, model_pool
     # Startup
     print(f"🚀 Loading model from Hugging Face: {MODEL_REPO}")
+    print(f"📊 Concurrent instances: {MODEL_POOL_SIZE}")
     # Initialize database
     init_db()
     print("✅ Database initialized")
     try:
+        # Initialize model pool for concurrent requests
+        model_class = RAGCybersecurityLLM if USE_RAG else CybersecurityLLM
+        model_pool = ModelPool(
+            pool_size=MODEL_POOL_SIZE,
+            model_class=model_class,
+            repo_id=MODEL_REPO,
+            filename=MODEL_FILENAME
+        )
+        # Keep one instance for backward compatibility (health checks, etc.)
+        llm_instance = model_class(
+            repo_id=MODEL_REPO,
+            filename=MODEL_FILENAME
+        )
         if CACHE_ENABLED:
             optimizer = PerformanceOptimizer()
         print(f"💾 Size: {llm_instance.get_model_info()['size_mb']:.2f} MB")
         print(f"🔧 RAG: {'Enabled' if USE_RAG else 'Disabled'}")
         print(f"⚡ Cache: {'Enabled' if CACHE_ENABLED else 'Disabled'}")
+        print(f"👥 Concurrent capacity: {MODEL_POOL_SIZE} users")
     except Exception as e:
         print(f"❌ Failed to load model: {e}")
         raise HTTPException(status_code=503, detail="Model not loaded")
     memory_status = memory_manager.check_memory() if memory_manager else {}
+    pool_status = model_pool.get_stats() if model_pool else {"pool_size": 0, "available": 0, "in_use": 0}
     return {
         "status": "healthy",
         "version": "2.0.0",
         "memory": memory_status,
         "cache_enabled": CACHE_ENABLED,
+        "rag_enabled": USE_RAG,
+        "concurrent_capacity": pool_status
     }
 @app.post("/chat/stream")
 async def chat_stream(request: ChatRequest):
+    """Streaming chat endpoint with concurrent request support"""
+    if model_pool is None:
+        raise HTTPException(status_code=503, detail="Model pool not initialized")
     # Track interaction
     count = increment_interaction()
     session_id = request.session_id or str(uuid.uuid4())
     async def generate():
+        model = None
         try:
             full_response = ""
+            # Get a model from the pool (will wait if all busy)
+            model = await model_pool.get_model(timeout=60.0)
+            # Send initial metadata with pool stats
+            pool_stats = model_pool.get_stats()
+            yield f"data: {json.dumps({{'type': 'start', 'session_id': session_id, 'model': MODEL_REPO, 'interaction_count': count, 'pool_available': pool_stats['available']})}\n\n"
             # Stream tokens
+            for token in model.generate_stream(
                     request.message,
                     max_tokens=request.max_tokens
             ):
         except Exception as e:
             yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
+        finally:
+            # Always return the model to the pool
+            if model is not None:
+                model_pool.return_model(model)
     return StreamingResponse(generate(), media_type="text/event-stream")