#!/bin/bash
set -euo pipefail

# Logging functions
# Emit a timestamped informational line on stdout.
log() { printf '[%s] %s\n' "$(date +'%H:%M:%S')" "$*"; }
# Emit a timestamped error line on stderr.
error() { printf '[%s] ERROR: %s\n' "$(date +'%H:%M:%S')" "$*" >&2; }

# System resource check
#######################################
# Probe host memory/CPU and export conservative limits when scarce.
# Globals:  may export OLLAMA_MAX_QUEUE, OMP_NUM_THREADS, MKL_NUM_THREADS
# Outputs:  status lines via log()
# Returns:  0 always
#######################################
check_system() {
    local mem_mb cpu_count
    # Declare and assign separately (SC2155) so a failing probe is not
    # silently masked by 'local' returning 0.
    mem_mb=$(free -m | awk 'NR==2{print $7}') || mem_mb=""
    cpu_count=$(nproc 2>/dev/null) || cpu_count=""

    # Guard against empty/non-numeric probe output (e.g. 'free' missing):
    # without this, the numeric comparisons below would abort under set -e.
    # Unknown memory is treated as low memory — the conservative choice.
    [[ "$mem_mb" =~ ^[0-9]+$ ]] || mem_mb=0
    [[ "$cpu_count" =~ ^[0-9]+$ ]] || cpu_count=1

    log "Available Memory: ${mem_mb}MB, CPU Cores: ${cpu_count}"

    # Adjust threading based on resources
    if (( mem_mb < 6000 )); then
        export OLLAMA_MAX_QUEUE=2
        log "Low memory detected - reduced queue size to 2"
    fi

    if (( cpu_count <= 2 )); then
        export OMP_NUM_THREADS=2
        export MKL_NUM_THREADS=2
        log "Limited CPU cores - adjusted thread count"
    fi
}

# Wait for service readiness
#######################################
# Start 'ollama serve' in the background and poll its TCP port until it
# accepts connections (up to ~60 seconds, 2 s intervals).
# Outputs:  status lines via log()/error()
# Returns:  0 when the port is ready; 1 on timeout or early server death
#######################################
wait_for_service() {
    log "Starting Ollama server..."
    ollama serve &
    local pid=$!

    # Wait up to 60 seconds for service
    local i
    for i in {1..30}; do
        # Fail fast instead of polling the full 60 s if the server
        # process has already exited (e.g. bad config, port in use).
        if ! kill -0 "$pid" 2>/dev/null; then
            error "Ollama server process exited prematurely"
            return 1
        fi
        if nc -z localhost 7860 2>/dev/null; then
            log "✓ Ollama service ready on port 7860"
            return 0
        fi
        sleep 2
    done

    error "Service failed to start within 60 seconds"
    kill "$pid" 2>/dev/null || true
    return 1
}

# Model management
#######################################
# Pre-pull the model named by $PRELOAD_MODEL and warm it up with a tiny
# prompt.  On failure, try small quantized fallbacks and record the one
# that worked in DEFAULT_MODEL.  Never fatal: always returns 0.
# Globals:  reads PRELOAD_MODEL; may export DEFAULT_MODEL
#######################################
setup_model() {
    local model="${PRELOAD_MODEL:-}"

    if [ -z "$model" ]; then
        log "No model preloading specified (set PRELOAD_MODEL env var)"
        return 0
    fi

    log "Attempting to preload model: $model"

    # Pull with a 5-minute cap; stderr is silenced on purpose because a
    # failed preload is non-fatal (the model loads on demand instead).
    if ! timeout 300 ollama pull "$model" 2>/dev/null; then
        log "⚠ Failed to preload $model - will load on demand"

        # Try lightweight alternatives
        local candidate
        for candidate in "gemma:2b-instruct-q4_0" "phi:2.7b-chat-v0.2-q4_0"; do
            log "Trying fallback: $candidate"
            if timeout 180 ollama pull "$candidate" 2>/dev/null; then
                log "✓ Fallback model $candidate loaded"
                export DEFAULT_MODEL="$candidate"
                break
            fi
        done
        return 0
    fi

    log "✓ Model $model loaded successfully"
    # Quick warmup request; tolerated if it times out or errors.
    echo "test" | timeout 15 ollama run "$model" >/dev/null 2>&1 || true
}

# Signal handling
# Signal handler: stop the background Ollama server and exit cleanly.
cleanup() {
    log "Shutting down gracefully..."
    # Best-effort kill; '|| true' ignores "no process matched".
    pkill -f "ollama serve" 2>/dev/null || true
    exit 0
}
# Install the handler for container stop (TERM) and Ctrl-C (INT).
trap cleanup TERM INT

# Main execution
#######################################
# Entry point: tune for host resources, start the server, preload a
# model, then block on the background job to keep the container alive.
# Returns:  exits 1 if the server never becomes ready
#######################################
main() {
    log "Starting Ollama with CPU optimizations"
    # ${VAR:-unset} defaults are required here: under 'set -u' an unset
    # OLLAMA_* variable would abort the whole script with
    # "unbound variable" before the server even starts.
    log "Config: PARALLEL=${OLLAMA_NUM_PARALLEL:-unset}, QUEUE=${OLLAMA_MAX_QUEUE:-unset}, KEEP_ALIVE=${OLLAMA_KEEP_ALIVE:-unset}"

    check_system

    if wait_for_service; then
        setup_model
        log "🚀 Ollama ready at http://localhost:7860"
        log "Send requests to /api/generate or /api/chat endpoints"

        # Keep container alive by waiting on the background 'ollama serve' job
        wait
    else
        error "Failed to initialize Ollama"
        exit 1
    fi
}

main "$@"