# AGI / Dockerfile
# Author: Dmitry Beresnev
# Commit 332826f: refactors the C++ LLM manager into modular components,
# moves Python modules under python/, and keeps the current control-plane
# behavior intact. The C++ server now has clearer separation for config,
# model lifecycle, runtime services, request parsing, HTTP helpers, and
# server routing, while Docker build/runtime paths were updated to compile
# multiple C++ files and load Python code from the new package folder.
FROM debian:bookworm-slim AS builder

# Toolchain and development headers for compiling llama.cpp (CMake/Ninja,
# OpenBLAS, cURL+OpenSSL for HuggingFace downloads) and the C++ manager
# (header-only Boost plus nlohmann/json). List is sorted alphabetically.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        libboost-dev \
        libcurl4-openssl-dev \
        libopenblas-dev \
        libssl-dev \
        ninja-build \
        nlohmann-json3-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*
# Clone and build llama.cpp with SSL support for HuggingFace Hub
WORKDIR /build
# Bumping CACHEBUST invalidates only the clone/build layer below, forcing a
# fresh checkout/build without disturbing earlier cached layers.
ARG CACHEBUST=6
# Git ref to build. Defaults to master, so the build is NOT reproducible
# unless a fixed tag or commit is supplied via --build-arg LLAMA_CPP_REF=...
ARG LLAMA_CPP_REF=master
# BUILD_PROFILE trades image-build time for inference speed:
#   fast_build   — -O1, BLAS off, single compile job (quick builds, slow runtime)
#   fast_runtime — -O3, OpenBLAS on, BUILD_JOBS parallel jobs (slow builds,
#                  fast runtime); currently the default
#ARG BUILD_PROFILE=fast_build
ARG BUILD_PROFILE=fast_runtime
ARG BUILD_JOBS=1
# Builds only the llama-server target. GGML_NATIVE=OFF with explicit
# AVX/AVX2/FMA/F16C keeps the binary portable across modern x86-64 hosts
# instead of being tuned to the build machine. LLAMA_CURL/LLAMA_OPENSSL
# enable HTTPS model downloads. The trailing `ldd ... || true` only logs the
# shared-library dependencies (to know what to copy into the runtime stage)
# and never fails the build.
RUN git clone --depth 1 --branch ${LLAMA_CPP_REF} https://github.com/ggerganov/llama.cpp.git && \
cd llama.cpp && \
if [ "${BUILD_PROFILE}" = "fast_runtime" ]; then \
C_FLAGS="-O3 -DNDEBUG"; \
CXX_FLAGS="-O3 -DNDEBUG"; \
BLAS_FLAG="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"; \
JOBS="${BUILD_JOBS}"; \
else \
C_FLAGS="-O1 -DNDEBUG"; \
CXX_FLAGS="-O1 -DNDEBUG"; \
BLAS_FLAG="-DGGML_BLAS=OFF"; \
JOBS="1"; \
fi && \
cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_FLAGS_RELEASE="${C_FLAGS}" \
-DCMAKE_CXX_FLAGS_RELEASE="${CXX_FLAGS}" \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DGGML_NATIVE=OFF \
-DGGML_AVX2=ON \
-DGGML_AVX=ON \
-DGGML_FMA=ON \
-DGGML_F16C=ON \
-DGGML_OPENMP=ON \
${BLAS_FLAG} \
-DLLAMA_CURL=ON \
-DLLAMA_OPENSSL=ON && \
cmake --build build --config Release --target llama-server -j"${JOBS}" && \
echo "=== Binary dependencies ===" && \
ldd build/bin/llama-server || true
# Compile the C++ control-plane manager (Boost.Beast + nlohmann/json).
# BOOST_ERROR_CODE_HEADER_ONLY avoids needing compiled Boost.System libs;
# only headers from libboost-dev are required.
COPY cpp/ /build/cpp/
RUN g++ -std=c++17 -O2 -pthread -DBOOST_ERROR_CODE_HEADER_ONLY \
        -o /build/llm-manager \
        /build/cpp/*.cpp
# Runtime stage: minimal shared libraries only, no build toolchain.
FROM debian:bookworm-slim

# cURL/OpenSSL for HTTPS model downloads, OpenMP + OpenBLAS for inference,
# libstdc++ for the C++ binaries. Sorted alphabetically.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libcurl4 \
        libgomp1 \
        libopenblas0-pthread \
        libstdc++6 \
        openssl \
    && rm -rf /var/lib/apt/lists/*
# Bring the compiled binaries and the llama.cpp shared libraries over from
# the builder stage, then refresh the dynamic-linker cache so /usr/local/lib
# resolves at runtime.
COPY --from=builder /build/llm-manager /usr/local/bin/llm-manager
COPY --from=builder /build/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server
COPY --from=builder /build/llama.cpp/build/bin/*.so.* /usr/local/lib/
RUN ldconfig
# Python runtime plus the packages needed by the FastAPI control-plane code.
# apt and pip run in a single layer so the apt lists are cleaned in the same
# layer that created them. --break-system-packages is required on Debian
# bookworm to install into the system interpreter (PEP 668).
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/* \
    && pip3 install --no-cache-dir --break-system-packages \
        aiohttp \
        beautifulsoup4 \
        duckduckgo-search \
        fastapi \
        lxml \
        pydantic \
        uvicorn
# Non-root runtime user (fixed UID 1000) with a writable model cache for
# llama.cpp downloads.
RUN useradd --create-home --uid 1000 user \
    && mkdir -p /home/user/.cache/llama.cpp \
    && chown -R user:user /home/user
# Application code: the Python package lives under /home/user/python.
COPY --chown=user:user python/ /home/user/python/
USER user
WORKDIR /home/user
# Set environment variables
#   LLAMA_CACHE      — where llama-server stores downloaded GGUF models
#                      (the directory created and chown'd to `user` above)
#   PATH             — include pip's per-user bin directory
#   PYTHONPATH       — makes modules under /home/user/python importable
#                      as top-level packages
#   PYTHONUNBUFFERED — flush Python stdout/stderr immediately so container
#                      logs stream in real time
ENV HOME=/home/user \
LLAMA_CACHE=/home/user/.cache/llama.cpp \
PATH=/home/user/.local/bin:$PATH \
PYTHONPATH=/home/user/python \
PYTHONUNBUFFERED=1
# EXPOSE is documentation only; the manager itself binds 0.0.0.0:7860.
EXPOSE 7860
# Former default: FastAPI app that managed llama-server internally. It has
# been superseded by the C++ llm-manager CMD at the end of this file.
#CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
# --- Optional: run llama.cpp C++ server directly (temporary rollout) ---
# The llm-manager CMD below is the active default. Uncomment ONE of the
# following to run llama-server directly instead.
#
# Example DeepSeek (4k context):
# CMD ["llama-server", "-hf", "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
# "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]
#
# Example Mixtral-8x7B-Instruct (known loader incompatibilities on newer llama.cpp + too large for 10GB RAM):
# CMD ["llama-server", "-hf", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF:mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "64", "--ubatch-size", "32"]
# CMD ["llama-server", "-hf", "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m", "--host", "0.0.0.0", "--port", "7860", "-c", "8192", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "128", "--ubatch-size", "64"]
#
# Active manager process:
# - loads default model at startup
# - supports /switch-model runtime model change
# - proxies /v1/chat/completions to active worker
# Manager configuration (read by llm-manager; defined in the cpp/ sources):
#   DEFAULT_MODEL      — HuggingFace repo:quant spec loaded at startup
#   MANAGER_HOST/PORT  — listen address/port of the control-plane HTTP server
#   WORKER_BASE_PORT   — presumably the first port assigned to spawned
#                        llama-server workers; confirm against cpp/ sources
#   SWITCH_TIMEOUT_SEC — time budget for a /switch-model operation
#   MODEL_N_CTX        — context window (tokens) for llama-server
#   MODEL_THREADS      — CPU threads for inference
#   MODEL_NGL          — GPU layers to offload (0 = CPU-only)
#   MODEL_BATCH/UBATCH — logical/physical batch sizes (kept small to fit
#                        constrained RAM)
ENV DEFAULT_MODEL=QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m \
MANAGER_HOST=0.0.0.0 \
MANAGER_PORT=7860 \
WORKER_BASE_PORT=8080 \
SWITCH_TIMEOUT_SEC=300 \
MODEL_N_CTX=8192 \
MODEL_THREADS=4 \
MODEL_NGL=0 \
MODEL_BATCH=64 \
MODEL_UBATCH=32
# Exec-form CMD: llm-manager runs as PID 1 and receives SIGTERM from
# `docker stop` directly (no intermediate shell).
CMD ["llm-manager"]
#
# Example Qwen2.5-Coder 7B Instruct (32k context):
# CMD ["llama-server", "-hf", "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Qwen2.5-Coder-7B-Instruct.Q4_K_M.gguf",
# "--host", "0.0.0.0", "--port", "7860", "-c", "32768", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]