# syntax=docker/dockerfile:1

# ---------- Build stage: compile llama.cpp (llama-server) and the C++ manager ----------
FROM debian:bookworm-slim AS builder

# Build dependencies: toolchain, CMake/Ninja, and dev headers for curl/SSL
# (llama-server pulls models from the HuggingFace Hub over HTTPS),
# Boost (manager uses Beast/JSON), OpenBLAS and nlohmann-json.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        libboost-dev \
        libcurl4-openssl-dev \
        libopenblas-dev \
        libssl-dev \
        ninja-build \
        nlohmann-json3-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build

# CACHEBUST: bump the value to force a fresh clone/build (master moves).
ARG CACHEBUST=6
# NOTE(review): "master" is not reproducible — pin a release tag or commit SHA
# for deterministic images.
ARG LLAMA_CPP_REF=master
#ARG BUILD_PROFILE=fast_build
ARG BUILD_PROFILE=fast_runtime
ARG BUILD_JOBS=1

# Clone and build llama-server.
#  - fast_runtime: -O3 + OpenBLAS, parallel build (BUILD_JOBS jobs)
#  - fast_build:   -O1, no BLAS, single job (faster CI iteration)
# GGML_NATIVE=OFF with explicit AVX/AVX2/FMA/F16C keeps the binary portable
# across x86-64 hosts that support those extensions (no -march=native).
RUN git clone --depth 1 --branch ${LLAMA_CPP_REF} https://github.com/ggerganov/llama.cpp.git && \
    cd llama.cpp && \
    if [ "${BUILD_PROFILE}" = "fast_runtime" ]; then \
        C_FLAGS="-O3 -DNDEBUG"; \
        CXX_FLAGS="-O3 -DNDEBUG"; \
        BLAS_FLAG="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"; \
        JOBS="${BUILD_JOBS}"; \
    else \
        C_FLAGS="-O1 -DNDEBUG"; \
        CXX_FLAGS="-O1 -DNDEBUG"; \
        BLAS_FLAG="-DGGML_BLAS=OFF"; \
        JOBS="1"; \
    fi && \
    cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_C_FLAGS_RELEASE="${C_FLAGS}" \
        -DCMAKE_CXX_FLAGS_RELEASE="${CXX_FLAGS}" \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=ON \
        -DGGML_NATIVE=OFF \
        -DGGML_AVX2=ON \
        -DGGML_AVX=ON \
        -DGGML_FMA=ON \
        -DGGML_F16C=ON \
        -DGGML_OPENMP=ON \
        ${BLAS_FLAG} \
        -DLLAMA_CURL=ON \
        -DLLAMA_OPENSSL=ON && \
    cmake --build build --config Release --target llama-server -j"${JOBS}" && \
    echo "=== Binary dependencies ===" && \
    ldd build/bin/llama-server || true && \
    mkdir -p /build/llama-libs && \
    find build -type f \( -name '*.so' -o -name '*.so.*' \) -exec cp -v {} /build/llama-libs/ \; || true

# Build the C++ manager (Boost.Beast + JSON, header-only error codes).
COPY cpp/ /build/cpp/
RUN g++ -std=c++17 -O2 -DBOOST_ERROR_CODE_HEADER_ONLY -pthread \
        -I/build/cpp \
        /build/cpp/config.cpp \
        /build/cpp/http_helpers.cpp \
        /build/cpp/llm_manager.cpp \
        /build/cpp/llm_manager_types.cpp \
        /build/cpp/model_manager.cpp \
        /build/cpp/request_parsing.cpp \
        /build/cpp/runtime_components.cpp \
        /build/cpp/server.cpp \
        -o /build/llm-manager

# ---------- Runtime stage ----------
FROM debian:bookworm-slim

# Runtime libraries (curl/SSL for HuggingFace downloads, libgomp + OpenBLAS
# for inference) plus Python for the FastAPI app — all in ONE layer so a
# single `apt-get update` covers every install (avoids the stale-cache /
# extra-layer problem of a second `apt-get update` later in the file).
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libcurl4 \
        libgomp1 \
        libopenblas0-pthread \
        libstdc++6 \
        openssl \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Copy llama-server, its bundled shared libraries, and the manager binary
# from the builder stage.
COPY --from=builder /build/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server
COPY --from=builder /build/llama-libs/ /usr/local/lib/
COPY --from=builder /build/llm-manager /usr/local/bin/llm-manager

# Refresh the dynamic-linker cache so the copied .so files in /usr/local/lib
# are found at runtime.
RUN ldconfig

# Python packages for the FastAPI app.
# NOTE(review): versions are unpinned — pin them (or install from a
# requirements.txt) for reproducible builds.
RUN pip3 install --no-cache-dir fastapi uvicorn aiohttp pydantic duckduckgo-search beautifulsoup4 lxml --break-system-packages

# Non-root user; llama.cpp caches downloaded models under ~/.cache/llama.cpp.
RUN useradd -m -u 1000 user && \
    mkdir -p /home/user/.cache/llama.cpp && \
    chown -R user:user /home/user

# Application code.
COPY --chown=user:user python/ /home/user/python/

USER user
WORKDIR /home/user

ENV HOME=/home/user \
    LLAMA_CACHE=/home/user/.cache/llama.cpp \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONPATH=/home/user/python \
    PYTHONUNBUFFERED=1

EXPOSE 7860

# Start FastAPI app (which manages llama-server internally)
#CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

# --- Optional: run llama.cpp C++ server directly (temporary rollout) ---
# Keep the FastAPI CMD above as the default. Uncomment ONE of the following
# to run the C++ server directly instead of the Python app.
#
# Example DeepSeek (4k context):
# CMD ["llama-server", "-hf", "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
#      "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]
#
# Example Mixtral-8x7B-Instruct (known loader incompatibilities on newer
# llama.cpp + too large for 10GB RAM):
# CMD ["llama-server", "-hf", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF:mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "64", "--ubatch-size", "32"]
#
# Example Qwen2.5 (quantized, 8k context):
# CMD ["llama-server", "-hf", "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m", "--host", "0.0.0.0", "--port", "7860", "-c", "8192", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "128", "--ubatch-size", "64"]

# Default configuration for the active manager process, which:
#   - loads DEFAULT_MODEL at startup
#   - supports /switch-model for runtime model changes
#   - proxies /v1/chat/completions to the active worker
# Manager listens on MANAGER_HOST:MANAGER_PORT; worker llama-server instances
# are started from WORKER_BASE_PORT upward. MODEL_* values are the per-worker
# llama-server defaults (context size, threads, GPU layers, batch sizes).
ENV DEFAULT_MODEL=QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m \
    MANAGER_HOST=0.0.0.0 \
    MANAGER_PORT=7860 \
    WORKER_BASE_PORT=8080 \
    SWITCH_TIMEOUT_SEC=300 \
    MODEL_N_CTX=8192 \
    MODEL_THREADS=4 \
    MODEL_NGL=0 \
    MODEL_BATCH=64 \
    MODEL_UBATCH=32

# Exec form: llm-manager runs as PID 1 and receives SIGTERM from `docker stop`.
CMD ["llm-manager"]

# Example Qwen2.5-Coder 7B Instruct (32k context):
# CMD ["llama-server", "-hf", "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Qwen2.5-Coder-7B-Instruct.Q4_K_M.gguf",
#      "--host", "0.0.0.0", "--port", "7860", "-c", "32768", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]