Spaces:
Running
Running
minor fix ollama
Browse files- Dockerfile +36 -1
- entrypoint.sh +103 -15
Dockerfile
CHANGED
|
@@ -13,7 +13,42 @@ RUN mkdir -p $HOME/.ollama && chown -R ollama-user:ollama-user $HOME/.ollama
|
|
| 13 |
# Install netcat (nc) for checking server readiness
|
| 14 |
RUN apt-get update && apt-get install -y netcat
|
| 15 |
|
| 16 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
|
| 18 |
|
| 19 |
# Set permissions for the entrypoint script
|
|
|
|
| 13 |
# Install netcat (nc) for checking server readiness
|
| 14 |
RUN apt-get update && apt-get install -y netcat
|
| 15 |
|
| 16 |
+
# Set core environment variables
|
| 17 |
+
ENV HOME=/home/ollama-user
|
| 18 |
+
ENV OLLAMA_HOST=0.0.0.0:7860
|
| 19 |
+
ENV OLLAMA_ORIGINS="*"
|
| 20 |
+
ENV OLLAMA_MODELS=/tmp/ollama-models
|
| 21 |
+
|
| 22 |
+
# Ollama performance optimizations
|
| 23 |
+
ENV OLLAMA_NUM_PARALLEL=1
|
| 24 |
+
ENV OLLAMA_MAX_LOADED_MODELS=1
|
| 25 |
+
ENV OLLAMA_MAX_QUEUE=3
|
| 26 |
+
ENV OLLAMA_FLASH_ATTENTION=1
|
| 27 |
+
ENV OLLAMA_KEEP_ALIVE=5m
|
| 28 |
+
ENV OLLAMA_NOPRUNE=false
|
| 29 |
+
|
| 30 |
+
# CPU-specific threading optimizations
|
| 31 |
+
ENV OMP_NUM_THREADS=4
|
| 32 |
+
ENV MKL_NUM_THREADS=4
|
| 33 |
+
ENV OPENBLAS_NUM_THREADS=4
|
| 34 |
+
ENV VECLIB_MAXIMUM_THREADS=4
|
| 35 |
+
ENV NUMEXPR_NUM_THREADS=4
|
| 36 |
+
ENV BLAS_NUM_THREADS=4
|
| 37 |
+
|
| 38 |
+
# Memory and performance tuning
|
| 39 |
+
ENV OLLAMA_MAX_VRAM=0
|
| 40 |
+
ENV MALLOC_ARENA_MAX=2
|
| 41 |
+
ENV MALLOC_MMAP_THRESHOLD_=131072
|
| 42 |
+
ENV MALLOC_TRIM_THRESHOLD_=131072
|
| 43 |
+
ENV GOMEMLIMIT=10GiB
|
| 44 |
+
ENV GOMAXPROCS=4
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# GPU disable for CPU-only inference
|
| 48 |
+
ENV CUDA_VISIBLE_DEVICES=""
|
| 49 |
+
ENV HIP_VISIBLE_DEVICES=""
|
| 50 |
+
|
| 51 |
+
# Copy scripts before user switch
|
| 52 |
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
|
| 53 |
|
| 54 |
# Set permissions for the entrypoint script
|
entrypoint.sh
CHANGED
|
@@ -1,19 +1,107 @@
|
|
| 1 |
#!/bin/bash
|
|
|
|
| 2 |
|
| 3 |
-
#
|
| 4 |
-
|
| 5 |
-
|
| 6 |
|
| 7 |
-
#
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
ollama
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/bin/bash
# Container entrypoint: start Ollama with CPU-oriented tuning, wait for
# readiness, optionally preload a model, then keep the container alive.
set -euo pipefail

# Logging helpers: timestamped lines to stdout (log) / stderr (error).
# printf is used instead of echo so a message that begins with '-n' or
# contains backslashes is printed literally.
log()   { printf '[%s] %s\n' "$(date +'%H:%M:%S')" "$*"; }
error() { printf '[%s] ERROR: %s\n' "$(date +'%H:%M:%S')" "$*" >&2; }
|
| 7 |
|
| 8 |
+
# Inspect host resources and tighten tuning knobs when the container is
# memory- or CPU-constrained.
# Globals:   OLLAMA_MAX_QUEUE, OMP_NUM_THREADS, MKL_NUM_THREADS (may be re-exported)
# Returns:   0 always
check_system() {
  local mem_mb cpu_count

  # free(1) row 2, column 7 is "available" memory in MiB. Declaration and
  # assignment are split so a pipeline failure is not masked by 'local';
  # fall back to 0/1 so the arithmetic tests below cannot crash under
  # 'set -eu' when free/nproc are missing or print nothing.
  mem_mb=$(free -m 2>/dev/null | awk 'NR==2{print $7}') || mem_mb=0
  mem_mb=${mem_mb:-0}
  cpu_count=$(nproc 2>/dev/null) || cpu_count=1

  log "Available Memory: ${mem_mb}MB, CPU Cores: ${cpu_count}"

  # Under ~6 GiB of available memory, shrink the request queue.
  if (( mem_mb < 6000 )); then
    export OLLAMA_MAX_QUEUE=2
    log "Low memory detected - reduced queue size to 2"
  fi

  # On 1-2 cores, cap OpenMP/MKL threading to avoid oversubscription.
  if (( cpu_count <= 2 )); then
    export OMP_NUM_THREADS=2
    export MKL_NUM_THREADS=2
    log "Limited CPU cores - adjusted thread count"
  fi
}
|
| 27 |
|
| 28 |
+
# Launch "ollama serve" in the background and block until it accepts
# connections on port 7860 (up to ~60 seconds, polling every 2s).
# Fails fast if the server process dies before becoming ready instead of
# waiting out the full timeout.
# Returns: 0 when the port is reachable, 1 on early exit or timeout.
wait_for_service() {
  log "Starting Ollama server..."
  ollama serve &
  local pid=$!
  local attempt

  for attempt in {1..30}; do
    # Bail out immediately if the server already exited (e.g. bad config).
    if ! kill -0 "$pid" 2>/dev/null; then
      error "Ollama server exited before becoming ready"
      return 1
    fi
    if nc -z localhost 7860 2>/dev/null; then
      log "✓ Ollama service ready on port 7860"
      return 0
    fi
    sleep 2
  done

  error "Service failed to start within 60 seconds"
  kill "$pid" 2>/dev/null || true
  return 1
}
|
| 47 |
+
|
| 48 |
+
# Optionally pre-pull the model named by $PRELOAD_MODEL, warm it up, and
# fall back to lightweight alternatives if the pull fails. Best-effort:
# every failure path is non-fatal and the model loads on demand instead.
# Globals: PRELOAD_MODEL (read), DEFAULT_MODEL (exported on fallback)
setup_model() {
  local model=${PRELOAD_MODEL:-}

  # Nothing requested — skip preloading entirely.
  if [[ -z "$model" ]]; then
    log "No model preloading specified (set PRELOAD_MODEL env var)"
    return 0
  fi

  log "Attempting to preload model: $model"

  if timeout 300 ollama pull "$model" 2>/dev/null; then
    log "✓ Model $model loaded successfully"
    # One throwaway prompt so the first real request isn't cold.
    echo "test" | timeout 15 ollama run "$model" >/dev/null 2>&1 || true
    return 0
  fi

  log "⚠ Failed to preload $model - will load on demand"

  # Requested model unavailable — try small quantized alternatives.
  local candidate
  for candidate in "gemma:2b-instruct-q4_0" "phi:2.7b-chat-v0.2-q4_0"; do
    log "Trying fallback: $candidate"
    if timeout 180 ollama pull "$candidate" 2>/dev/null; then
      log "✓ Fallback model $candidate loaded"
      export DEFAULT_MODEL="$candidate"
      break
    fi
  done
}
|
| 78 |
+
|
| 79 |
+
# Terminate the background Ollama server and exit cleanly when the
# container is asked to stop (docker stop sends TERM).
cleanup() {
  log "Shutting down gracefully..."
  # Best effort: the serve process may already be gone.
  pkill -f "ollama serve" 2>/dev/null || true
  exit 0
}
trap cleanup TERM INT
|
| 86 |
+
|
| 87 |
+
# Orchestrate startup: tune for available resources, start the server,
# optionally preload a model, then block so the container stays alive.
# Exits 1 if the server never becomes ready.
main() {
  log "Starting Ollama with CPU optimizations"
  # ':-' defaults mirror the Dockerfile ENV values so 'set -u' does not
  # abort when the script is run outside the image that provides them.
  log "Config: PARALLEL=${OLLAMA_NUM_PARALLEL:-1}, QUEUE=${OLLAMA_MAX_QUEUE:-3}, KEEP_ALIVE=${OLLAMA_KEEP_ALIVE:-5m}"

  check_system

  if wait_for_service; then
    setup_model
    log "🚀 Ollama ready at http://localhost:7860"
    log "Send requests to /api/generate or /api/chat endpoints"

    # Block on the backgrounded "ollama serve" so PID 1 stays alive and
    # the TERM/INT trap can run on shutdown.
    wait
  else
    error "Failed to initialize Ollama"
    exit 1
  fi
}

main "$@"
|