Spaces:
Running
Running
minor fix ollama
Browse files- Dockerfile +36 -1
- entrypoint.sh +103 -15
Dockerfile
CHANGED
|
@@ -13,7 +13,42 @@ RUN mkdir -p $HOME/.ollama && chown -R ollama-user:ollama-user $HOME/.ollama
|
|
| 13 |
# Install netcat (nc) for checking server readiness
|
| 14 |
RUN apt-get update && apt-get install -y netcat
|
| 15 |
|
| 16 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
|
| 18 |
|
| 19 |
# Set permissions for the entrypoint script
|
|
|
|
| 13 |
# Install netcat (nc) for checking server readiness
|
| 14 |
RUN apt-get update && apt-get install -y netcat
|
| 15 |
|
| 16 |
+
# Set core environment variables
|
| 17 |
+
ENV HOME=/home/ollama-user
|
| 18 |
+
ENV OLLAMA_HOST=0.0.0.0:7860
|
| 19 |
+
ENV OLLAMA_ORIGINS="*"
|
| 20 |
+
ENV OLLAMA_MODELS=/tmp/ollama-models
|
| 21 |
+
|
| 22 |
+
# Ollama performance optimizations
|
| 23 |
+
ENV OLLAMA_NUM_PARALLEL=1
|
| 24 |
+
ENV OLLAMA_MAX_LOADED_MODELS=1
|
| 25 |
+
ENV OLLAMA_MAX_QUEUE=3
|
| 26 |
+
ENV OLLAMA_FLASH_ATTENTION=1
|
| 27 |
+
ENV OLLAMA_KEEP_ALIVE=5m
|
| 28 |
+
ENV OLLAMA_NOPRUNE=false
|
| 29 |
+
|
| 30 |
+
# CPU-specific threading optimizations
|
| 31 |
+
ENV OMP_NUM_THREADS=4
|
| 32 |
+
ENV MKL_NUM_THREADS=4
|
| 33 |
+
ENV OPENBLAS_NUM_THREADS=4
|
| 34 |
+
ENV VECLIB_MAXIMUM_THREADS=4
|
| 35 |
+
ENV NUMEXPR_NUM_THREADS=4
|
| 36 |
+
ENV BLAS_NUM_THREADS=4
|
| 37 |
+
|
| 38 |
+
# Memory and performance tuning
|
| 39 |
+
ENV OLLAMA_MAX_VRAM=0
|
| 40 |
+
ENV MALLOC_ARENA_MAX=2
|
| 41 |
+
ENV MALLOC_MMAP_THRESHOLD_=131072
|
| 42 |
+
ENV MALLOC_TRIM_THRESHOLD_=131072
|
| 43 |
+
ENV GOMEMLIMIT=10GiB
|
| 44 |
+
ENV GOMAXPROCS=4
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# GPU disable for CPU-only inference
|
| 48 |
+
ENV CUDA_VISIBLE_DEVICES=""
|
| 49 |
+
ENV HIP_VISIBLE_DEVICES=""
|
| 50 |
+
|
| 51 |
+
# Copy scripts before user switch
|
| 52 |
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
|
| 53 |
|
| 54 |
# Set permissions for the entrypoint script
|
entrypoint.sh
CHANGED
|
@@ -1,19 +1,107 @@
|
|
| 1 |
#!/bin/bash
|
|
|
|
| 2 |
|
| 3 |
-
#
|
| 4 |
-
|
| 5 |
-
|
| 6 |
|
| 7 |
-
#
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
ollama
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/bin/bash
# Container entrypoint: start Ollama with CPU-oriented tuning, wait for
# readiness, optionally preload a model, then keep the container alive.
set -euo pipefail

# Logging helpers: timestamped lines to stdout (log) / stderr (error).
# printf is used instead of echo so a message that begins with '-n' or
# contains backslashes is printed literally.
log()   { printf '[%s] %s\n' "$(date +'%H:%M:%S')" "$*"; }
error() { printf '[%s] ERROR: %s\n' "$(date +'%H:%M:%S')" "$*" >&2; }
|
| 7 |
|
| 8 |
+
# Inspect host resources and tighten tuning knobs when the container is
# memory- or CPU-constrained.
# Globals:   OLLAMA_MAX_QUEUE, OMP_NUM_THREADS, MKL_NUM_THREADS (may be re-exported)
# Returns:   0 always
check_system() {
  local mem_mb cpu_count

  # free(1) row 2, column 7 is "available" memory in MiB. Declaration and
  # assignment are split so a pipeline failure is not masked by 'local';
  # fall back to 0/1 so the arithmetic tests below cannot crash under
  # 'set -eu' when free/nproc are missing or print nothing.
  mem_mb=$(free -m 2>/dev/null | awk 'NR==2{print $7}') || mem_mb=0
  mem_mb=${mem_mb:-0}
  cpu_count=$(nproc 2>/dev/null) || cpu_count=1

  log "Available Memory: ${mem_mb}MB, CPU Cores: ${cpu_count}"

  # Under ~6 GiB of available memory, shrink the request queue.
  if (( mem_mb < 6000 )); then
    export OLLAMA_MAX_QUEUE=2
    log "Low memory detected - reduced queue size to 2"
  fi

  # On 1-2 cores, cap OpenMP/MKL threading to avoid oversubscription.
  if (( cpu_count <= 2 )); then
    export OMP_NUM_THREADS=2
    export MKL_NUM_THREADS=2
    log "Limited CPU cores - adjusted thread count"
  fi
}
|
| 27 |
|
| 28 |
+
# Launch "ollama serve" in the background and block until it accepts
# connections on port 7860 (up to ~60 seconds, polling every 2s).
# Fails fast if the server process dies before becoming ready instead of
# waiting out the full timeout.
# Returns: 0 when the port is reachable, 1 on early exit or timeout.
wait_for_service() {
  log "Starting Ollama server..."
  ollama serve &
  local pid=$!
  local attempt

  for attempt in {1..30}; do
    # Bail out immediately if the server already exited (e.g. bad config).
    if ! kill -0 "$pid" 2>/dev/null; then
      error "Ollama server exited before becoming ready"
      return 1
    fi
    if nc -z localhost 7860 2>/dev/null; then
      log "✓ Ollama service ready on port 7860"
      return 0
    fi
    sleep 2
  done

  error "Service failed to start within 60 seconds"
  kill "$pid" 2>/dev/null || true
  return 1
}
|
| 47 |
+
|
| 48 |
+
# Optionally pre-pull the model named by $PRELOAD_MODEL, warm it up, and
# fall back to lightweight alternatives if the pull fails. Best-effort:
# every failure path is non-fatal and the model loads on demand instead.
# Globals: PRELOAD_MODEL (read), DEFAULT_MODEL (exported on fallback)
setup_model() {
  local model=${PRELOAD_MODEL:-}

  # Nothing requested — skip preloading entirely.
  if [[ -z "$model" ]]; then
    log "No model preloading specified (set PRELOAD_MODEL env var)"
    return 0
  fi

  log "Attempting to preload model: $model"

  if timeout 300 ollama pull "$model" 2>/dev/null; then
    log "✓ Model $model loaded successfully"
    # One throwaway prompt so the first real request isn't cold.
    echo "test" | timeout 15 ollama run "$model" >/dev/null 2>&1 || true
    return 0
  fi

  log "⚠ Failed to preload $model - will load on demand"

  # Requested model unavailable — try small quantized alternatives.
  local candidate
  for candidate in "gemma:2b-instruct-q4_0" "phi:2.7b-chat-v0.2-q4_0"; do
    log "Trying fallback: $candidate"
    if timeout 180 ollama pull "$candidate" 2>/dev/null; then
      log "✓ Fallback model $candidate loaded"
      export DEFAULT_MODEL="$candidate"
      break
    fi
  done
}
|
| 78 |
+
|
| 79 |
+
# Terminate the background Ollama server and exit cleanly when the
# container is asked to stop (docker stop sends TERM).
cleanup() {
  log "Shutting down gracefully..."
  # Best effort: the serve process may already be gone.
  pkill -f "ollama serve" 2>/dev/null || true
  exit 0
}
trap cleanup TERM INT
|
| 86 |
+
|
| 87 |
+
# Orchestrate startup: tune for available resources, start the server,
# optionally preload a model, then block so the container stays alive.
# Exits 1 if the server never becomes ready.
main() {
  log "Starting Ollama with CPU optimizations"
  # ':-' defaults mirror the Dockerfile ENV values so 'set -u' does not
  # abort when the script is run outside the image that provides them.
  log "Config: PARALLEL=${OLLAMA_NUM_PARALLEL:-1}, QUEUE=${OLLAMA_MAX_QUEUE:-3}, KEEP_ALIVE=${OLLAMA_KEEP_ALIVE:-5m}"

  check_system

  if wait_for_service; then
    setup_model
    log "🚀 Ollama ready at http://localhost:7860"
    log "Send requests to /api/generate or /api/chat endpoints"

    # Block on the backgrounded "ollama serve" so PID 1 stays alive and
    # the TERM/INT trap can run on shutdown.
    wait
  else
    error "Failed to initialize Ollama"
    exit 1
  fi
}

main "$@"
|