wwforonce committed on
Commit
6806c38
·
1 Parent(s): 00ab09e

minor fix ollama

Browse files
Files changed (2) hide show
  1. Dockerfile +36 -1
  2. entrypoint.sh +103 -15
Dockerfile CHANGED
@@ -13,7 +13,42 @@ RUN mkdir -p $HOME/.ollama && chown -R ollama-user:ollama-user $HOME/.ollama
13
  # Install netcat (nc) for checking server readiness
14
  RUN apt-get update && apt-get install -y netcat
15
 
16
- # Copy the entrypoint script before switching users
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  COPY entrypoint.sh /usr/local/bin/entrypoint.sh
18
 
19
  # Set permissions for the entrypoint script
 
13
  # Install netcat (nc) for checking server readiness
14
  RUN apt-get update && apt-get install -y netcat
15
 
16
# Set core environment variables
# HOME matches the ollama-user home dir created earlier in this Dockerfile.
ENV HOME=/home/ollama-user
# Listen on all interfaces, port 7860 — the same port entrypoint.sh probes with nc.
ENV OLLAMA_HOST=0.0.0.0:7860
# NOTE(review): "*" allows requests from any origin — confirm that is intended
# for a publicly hosted space.
ENV OLLAMA_ORIGINS="*"
# Model store under /tmp — presumably chosen as an always-writable location for
# the non-root user; verify persistence requirements (contents are ephemeral).
ENV OLLAMA_MODELS=/tmp/ollama-models

# Ollama performance optimizations
ENV OLLAMA_NUM_PARALLEL=1
ENV OLLAMA_MAX_LOADED_MODELS=1
ENV OLLAMA_MAX_QUEUE=3
ENV OLLAMA_FLASH_ATTENTION=1
# Unload idle models after 5 minutes to free memory.
ENV OLLAMA_KEEP_ALIVE=5m
ENV OLLAMA_NOPRUNE=false

# CPU-specific threading optimizations — cap every common BLAS/OpenMP backend
# at 4 threads. NOTE(review): assumes a 4-vCPU host; entrypoint.sh lowers some
# of these further when fewer cores are detected.
ENV OMP_NUM_THREADS=4
ENV MKL_NUM_THREADS=4
ENV OPENBLAS_NUM_THREADS=4
ENV VECLIB_MAXIMUM_THREADS=4
ENV NUMEXPR_NUM_THREADS=4
ENV BLAS_NUM_THREADS=4

# Memory and performance tuning
ENV OLLAMA_MAX_VRAM=0
# glibc malloc tuning: fewer arenas and lower mmap/trim thresholds to reduce
# fragmentation in a long-lived server process.
ENV MALLOC_ARENA_MAX=2
ENV MALLOC_MMAP_THRESHOLD_=131072
ENV MALLOC_TRIM_THRESHOLD_=131072
# Go runtime limits for the ollama binary itself.
# NOTE(review): GOMEMLIMIT=10GiB assumes the container has >10 GiB RAM — confirm.
ENV GOMEMLIMIT=10GiB
ENV GOMAXPROCS=4


# GPU disable for CPU-only inference — hide all CUDA and ROCm devices.
ENV CUDA_VISIBLE_DEVICES=""
ENV HIP_VISIBLE_DEVICES=""
50
+
51
+ # Copy scripts before user switch
52
  COPY entrypoint.sh /usr/local/bin/entrypoint.sh
53
 
54
  # Set permissions for the entrypoint script
entrypoint.sh CHANGED
@@ -1,19 +1,107 @@
1
  #!/bin/bash
 
2
 
3
- # Start Ollama server in the background
4
- # OLLAMA_HOST=0.0.0.0
5
- OLLAMA_ORIGINS=* ollama serve &
6
 
7
- # Wait for the server to be ready
8
- while ! nc -z localhost 7860; do
9
- echo "Waiting for Ollama server to start..."
10
- sleep 1
11
- done
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # Pull the model
14
- echo "Pulling the model..."
15
- #ollama pull gemma3:1b
16
- ollama pull qwen3:0.6b
17
- ollama pull nomic-embed-text:latest
18
- # Keep the container running
19
- wait
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
set -euo pipefail

# Logging helpers: timestamped line to stdout (log) or stderr (error).
# printf is used instead of `echo "$*"` so messages that begin with '-'
# or contain backslashes are printed literally (echo would interpret them).
log() { printf '[%s] %s\n' "$(date +'%H:%M:%S')" "$*"; }
error() { printf '[%s] ERROR: %s\n' "$(date +'%H:%M:%S')" "ERROR: $*" >&2; }
7
 
8
# System resource check: log available RAM/CPUs and tighten the queue and
# thread-count env vars on small machines.
# Globals:  exports OLLAMA_MAX_QUEUE, OMP_NUM_THREADS, MKL_NUM_THREADS
# Outputs:  status lines via log()
# Returns:  0
check_system() {
  local mem_mb cpu_count

  # Declaration split from assignment so a failing pipeline is not masked
  # (SC2155); default to 0/1 so the numeric tests below never see an empty
  # string, which would abort the whole script under `set -e`.
  mem_mb=$(free -m | awk 'NR==2{print $7}') || mem_mb=0
  mem_mb=${mem_mb:-0}
  cpu_count=$(nproc) || cpu_count=1

  log "Available Memory: ${mem_mb}MB, CPU Cores: ${cpu_count}"

  # Adjust threading based on resources
  if [ "$mem_mb" -lt 6000 ]; then
    export OLLAMA_MAX_QUEUE=2
    log "Low memory detected - reduced queue size to 2"
  fi

  if [ "$cpu_count" -le 2 ]; then
    export OMP_NUM_THREADS=2
    export MKL_NUM_THREADS=2
    log "Limited CPU cores - adjusted thread count"
  fi
}
27
 
28
# Wait for service readiness: start `ollama serve` in the background and poll
# port 7860 until it answers.
# Outputs:  status lines via log()/error()
# Returns:  0 once the port is open, 1 on timeout or early server death
wait_for_service() {
  log "Starting Ollama server..."
  ollama serve &
  local pid=$!

  # Wait up to 60 seconds for service (30 attempts x 2 s).
  local attempt
  for attempt in {1..30}; do
    # Fail fast if the server process already died instead of silently
    # polling a dead port for the full 60 seconds.
    if ! kill -0 "$pid" 2>/dev/null; then
      error "Ollama server process exited prematurely"
      return 1
    fi
    if nc -z localhost 7860 2>/dev/null; then
      log "✓ Ollama service ready on port 7860"
      return 0
    fi
    sleep 2
  done

  error "Service failed to start within 60 seconds"
  kill "$pid" 2>/dev/null || true
  return 1
}
47
+
48
# Model management: optionally pre-pull the model named in $PRELOAD_MODEL,
# warm it up, and fall back to small quantized models if the pull fails.
# Globals:  reads PRELOAD_MODEL; may export DEFAULT_MODEL
# Returns:  0 (best-effort; failures only downgrade to on-demand loading)
setup_model() {
  local requested="${PRELOAD_MODEL:-}"

  # Nothing to do unless the operator asked for a preload.
  if [ -z "$requested" ]; then
    log "No model preloading specified (set PRELOAD_MODEL env var)"
    return 0
  fi

  log "Attempting to preload model: $requested"

  # Pull with a 5-minute cap; on success, run a tiny prompt to warm caches.
  if timeout 300 ollama pull "$requested" 2>/dev/null; then
    log "✓ Model $requested loaded successfully"
    echo "test" | timeout 15 ollama run "$requested" >/dev/null 2>&1 || true
    return 0
  fi

  log "⚠ Failed to preload $requested - will load on demand"

  # Best effort: try lightweight quantized alternatives, keep the first that works.
  local candidate
  for candidate in "gemma:2b-instruct-q4_0" "phi:2.7b-chat-v0.2-q4_0"; do
    log "Trying fallback: $candidate"
    if timeout 180 ollama pull "$candidate" 2>/dev/null; then
      log "✓ Fallback model $candidate loaded"
      export DEFAULT_MODEL="$candidate"
      break
    fi
  done
}
78
+
79
# Signal handling: on SIGTERM (container stop) or SIGINT (Ctrl-C), kill the
# background `ollama serve` process and exit 0 so the container shuts down
# cleanly. pkill is best-effort (`|| true`) — the server may already be gone.
cleanup() {
log "Shutting down gracefully..."
pkill -f "ollama serve" 2>/dev/null || true
exit 0
}
trap cleanup SIGTERM SIGINT
86
+
87
# Main execution: log the effective config, tune for the host, start the
# server, optionally preload a model, then block on `wait` to keep the
# container alive.
# Returns:  does not return on success (waits); exits 1 on startup failure
main() {
  log "Starting Ollama with CPU optimizations"
  # ${VAR:-unset} defaults keep this line from aborting under `set -u` when
  # the script is run outside the Docker image that defines these ENV vars.
  log "Config: PARALLEL=${OLLAMA_NUM_PARALLEL:-unset}, QUEUE=${OLLAMA_MAX_QUEUE:-unset}, KEEP_ALIVE=${OLLAMA_KEEP_ALIVE:-unset}"

  check_system

  if wait_for_service; then
    setup_model
    log "🚀 Ollama ready at http://localhost:7860"
    log "Send requests to /api/generate or /api/chat endpoints"

    # Keep container alive until the background server exits or a signal fires.
    wait
  else
    error "Failed to initialize Ollama"
    exit 1
  fi
}
106
+
107
+ main "$@"