Andrew McCracken
Claude
committed on
Commit · bfa102d
1 Parent(s): 457c9e1
Add GPU support
Added GPU-enabled Docker configuration:
- Dockerfile.base.gpu: CUDA 12.1 base with llama-cpp-python GPU support
- Dockerfile.gpu: HF Spaces GPU deployment dockerfile
- build-and-push-gpu.sh: Script to build and push GPU image
- Updated llm_handler.py to use N_GPU_LAYERS env variable
To use GPU:
1. Build: ./build-and-push-gpu.sh
2. Switch HF Space to GPU hardware
3. Use Dockerfile.gpu for deployment
Expected speedup: ~15s → 1-3s per response
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
- Dockerfile.base.gpu +52 -0
- Dockerfile.gpu +26 -0
- build-and-push-gpu.sh +64 -0
- llm_handler.py +11 -2
Dockerfile.base.gpu
ADDED
@@ -0,0 +1,52 @@
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
+
+WORKDIR /app
+
+# Install Python and system dependencies
+RUN apt-get update && apt-get install -y \
+    python3.11 \
+    python3.11-dev \
+    python3-pip \
+    build-essential \
+    cmake \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set Python 3.11 as default
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
+    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
+
+# Upgrade pip
+RUN python -m pip install --upgrade pip
+
+# Copy requirements and install
+COPY requirements.txt .
+
+# Install llama-cpp-python with CUDA support
+RUN CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python --no-cache-dir
+
+# Install remaining dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application code
+COPY . .
+
+# Create data directory for persistence
+RUN mkdir -p /data
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV MODEL_REPO=daskalos-apps/phi4-cybersec-Q4_K_M
+ENV MODEL_FILENAME=phi4-mini-instruct-Q4_K_M.gguf
+ENV USE_RAG=false
+ENV CACHE_ENABLED=true
+
+# Expose port
+EXPOSE 8000
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD python -c "import requests; requests.get('http://localhost:8000/health')"
+
+# Run the application
+CMD ["python", "main.py"]
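The HEALTHCHECK above probes a /health route that main.py is expected to serve; main.py itself is not part of this commit. Note that as written the check only fails when the request raises (connection refused or timeout): a non-200 response still passes, since raise_for_status() is never called. For reference, a minimal sketch of a compatible endpoint, assuming a FastAPI app (the repo's actual framework is not shown in this diff):

# Hypothetical sketch of the /health endpoint the HEALTHCHECK probes;
# the real main.py is not part of this commit.
from fastapi import FastAPI
import uvicorn

app = FastAPI()

@app.get("/health")
def health():
    # Any response that arrives satisfies the requests.get() probe above.
    return {"status": "ok"}

if __name__ == "__main__":
    # Bind to 0.0.0.0 so port 8000 is reachable through Docker's mapping.
    uvicorn.run(app, host="0.0.0.0", port=8000)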
Dockerfile.gpu
ADDED
@@ -0,0 +1,26 @@
+# Use pre-built GPU image from Docker Hub
+# Build this image locally with: docker buildx build --platform linux/amd64 -f Dockerfile.base.gpu -t techdaskalos/cybersecchatbot:gpu . --push
+FROM techdaskalos/cybersecchatbot:gpu
+
+# Environment variables (already set in base image, but can override)
+ENV PYTHONUNBUFFERED=1
+ENV MODEL_REPO=daskalos-apps/phi4-cybersec-Q4_K_M
+ENV MODEL_FILENAME=phi4-mini-instruct-Q4_K_M.gguf
+ENV USE_RAG=false
+ENV CACHE_ENABLED=true
+
+# GPU configuration - offload all layers to GPU
+ENV N_GPU_LAYERS=35
+
+# Set Hugging Face cache to /data for persistence and write permissions
+ENV HF_HOME=/data/huggingface
+
+# Ensure all required directories exist and are writable
+RUN mkdir -p /data /app/models /app/knowledge_db /data/huggingface/hub /data/huggingface/transformers && \
+    chmod -R 777 /data /app/models /app/knowledge_db
+
+# Copy test interface (needed for /test endpoint)
+COPY test_interface.html /app/
+
+EXPOSE 8000
+CMD ["python", "main.py"]
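N_GPU_LAYERS=35 is meant to offload every layer of the Phi-4-mini model; whether that fits depends on free VRAM, so a pre-flight check helps when picking a value for other hardware. A sketch using pynvml (the nvidia-ml-py package, an assumption here, not one of this repo's dependencies):

# Pre-flight VRAM check before choosing an N_GPU_LAYERS value.
# Assumes the nvidia-ml-py package (pip install nvidia-ml-py); not in this repo.
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # GPU 0
mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
print(f"Free VRAM: {mem.free / 1024**3:.1f} GB")
pynvml.nvmlShutdown()

A Q4_K_M GGUF of a model this size is on the order of 2-3 GB on disk, so any GPU tier HF Spaces offers should hold all 35 layers plus the KV cache.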
build-and-push-gpu.sh
ADDED
@@ -0,0 +1,64 @@
+#!/bin/bash
+set -e
+
+# Configuration
+DOCKER_USERNAME="techdaskalos"
+IMAGE_NAME="cybersecchatbot"
+VERSION="${1:-gpu}"
+FULL_IMAGE="$DOCKER_USERNAME/$IMAGE_NAME:$VERSION"
+
+echo "🏗️  Building GPU Docker image: $FULL_IMAGE"
+echo "================================"
+
+# Build the image for GPU
+docker buildx build --platform linux/amd64 -f Dockerfile.base.gpu -t "$FULL_IMAGE" .
+
+echo ""
+echo "✅ Build complete!"
+echo ""
+echo "🧪 Testing the image locally (requires NVIDIA GPU)..."
+echo "   Run: docker run --gpus all -p 8000:8000 $FULL_IMAGE"
+echo "   Then visit: http://localhost:8000/test"
+echo ""
+
+read -p "Would you like to test locally before pushing? (y/n) " -n 1 -r
+echo
+if [[ $REPLY =~ ^[Yy]$ ]]; then
+    echo "Starting local test server with GPU support..."
+    echo "Press Ctrl+C to stop when done testing"
+    docker run --gpus all -p 8000:8000 "$FULL_IMAGE"
+fi
+
+echo ""
+read -p "Push to Docker Hub? (y/n) " -n 1 -r
+echo
+if [[ $REPLY =~ ^[Yy]$ ]]; then
+    echo "📤 Pushing to Docker Hub..."
+
+    # Check if logged in
+    if ! docker info | grep -q "Username: $DOCKER_USERNAME"; then
+        echo "Please login to Docker Hub:"
+        docker login
+    fi
+
+    docker push "$FULL_IMAGE"
+
+    echo ""
+    echo "✅ Successfully pushed: $FULL_IMAGE"
+    echo ""
+    echo "📋 Next steps:"
+    echo "   1. Update your HF Space Dockerfile to:"
+    echo "      FROM $FULL_IMAGE"
+    echo ""
+    echo "   2. Update HF Space to use GPU hardware"
+    echo ""
+    echo "   3. Commit and push to HF Spaces:"
+    echo "      cp Dockerfile.gpu Dockerfile"
+    echo "      git add Dockerfile"
+    echo "      git commit -m \"Switch to GPU-enabled image: $FULL_IMAGE\""
+    echo "      git push"
+    echo ""
+    echo "   Your HF Space will deploy with GPU acceleration!"
+else
+    echo "Skipped push. Image is ready locally as: $FULL_IMAGE"
+fi
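The script runs the local test container in the foreground, so a readiness check has to come from a second terminal. A small poll against the health route, sketched with the requests library (the 30 x 10s retry budget is arbitrary; first start also downloads the model, which can take a while):

# Poll the locally running container until the server answers, or give up.
import time
import requests

for _ in range(30):
    try:
        status = requests.get("http://localhost:8000/health", timeout=5).status_code
        print(f"/health -> {status}")
        break
    except requests.exceptions.ConnectionError:
        time.sleep(10)  # model download/load can dominate first startup
else:
    raise SystemExit("Server never came up")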
llm_handler.py
CHANGED
@@ -49,12 +49,21 @@ class CybersecurityLLM:
 
         # Initialize llama.cpp with the model
         logger.info("Initializing model...")
+
+        # Check for GPU support via environment variable
+        n_gpu_layers = int(os.getenv("N_GPU_LAYERS", "0"))
+
+        if n_gpu_layers > 0:
+            logger.info(f"GPU acceleration enabled: {n_gpu_layers} layers")
+        else:
+            logger.info("Running in CPU-only mode")
+
         self.llm = Llama(
             model_path=model_path,
             n_ctx=4096,  # Context window
             n_batch=512,  # Batch size for prompt processing
-            n_threads=6
-            n_gpu_layers=
+            n_threads=6 if n_gpu_layers == 0 else 4,  # Fewer threads needed with GPU
+            n_gpu_layers=n_gpu_layers,  # GPU layers (0 for CPU-only)
             seed=-1,  # Random seed
             f16_kv=True,  # Use f16 for key/value cache (saves memory)
             logits_all=False,  # Only compute logits for last token
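The commit message's ~15s → 1-3s estimate is straightforward to sanity-check by timing one completion per setting. A rough harness, assuming the same llama-cpp-python Llama API used above; the local model path is hypothetical:

# Compare CPU-only vs. GPU-offloaded latency for one short completion.
# Assumes llama-cpp-python is installed; the model path below is hypothetical.
import time
from llama_cpp import Llama

def time_completion(n_gpu_layers: int) -> float:
    llm = Llama(
        model_path="models/phi4-mini-instruct-Q4_K_M.gguf",  # hypothetical path
        n_ctx=4096,
        n_gpu_layers=n_gpu_layers,
        verbose=False,
    )
    start = time.perf_counter()
    llm("Explain what a buffer overflow is.", max_tokens=128)
    return time.perf_counter() - start

print(f"CPU-only (0 layers): {time_completion(0):.1f}s")
print(f"GPU (35 layers):     {time_completion(35):.1f}s")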