Spaces:
Build error
Build error
Dmitry Beresnev
Refactors the C++ LLM manager into modular components, moves Python modules under python/, and keeps the current control-plane behavior intact. The C++ server now has clearer separation for config, model lifecycle, runtime services, request parsing, HTTP helpers, and server routing, while the Docker build/runtime paths were updated to compile multiple C++ files and load Python code from the new package folder.
# ---------- Build stage ----------
FROM debian:bookworm-slim AS builder

# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    build-essential \
    cmake \
    ninja-build \
    ca-certificates \
    libcurl4-openssl-dev \
    libssl-dev \
    libboost-dev \
    libopenblas-dev \
    nlohmann-json3-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Clone and build llama.cpp with SSL support for HuggingFace Hub
WORKDIR /build
ARG CACHEBUST=6
ARG LLAMA_CPP_REF=master
#ARG BUILD_PROFILE=fast_build
ARG BUILD_PROFILE=fast_runtime
ARG BUILD_JOBS=1
# FIX: a declared-but-unused ARG does NOT invalidate Docker's layer cache;
# the variable must be referenced inside the RUN for a bumped CACHEBUST value
# to force a fresh clone/build. The echo below makes the cache-bust effective.
RUN echo "cachebust=${CACHEBUST}" && \
    git clone --depth 1 --branch ${LLAMA_CPP_REF} https://github.com/ggerganov/llama.cpp.git && \
    cd llama.cpp && \
    # Select compile flags per build profile:
    #   fast_runtime -> -O3 + OpenBLAS, parallel jobs (slower build, faster inference)
    #   fast_build   -> -O1, no BLAS, single job (quicker image builds)
    if [ "${BUILD_PROFILE}" = "fast_runtime" ]; then \
        C_FLAGS="-O3 -DNDEBUG"; \
        CXX_FLAGS="-O3 -DNDEBUG"; \
        BLAS_FLAG="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"; \
        JOBS="${BUILD_JOBS}"; \
    else \
        C_FLAGS="-O1 -DNDEBUG"; \
        CXX_FLAGS="-O1 -DNDEBUG"; \
        BLAS_FLAG="-DGGML_BLAS=OFF"; \
        JOBS="1"; \
    fi && \
    cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_C_FLAGS_RELEASE="${C_FLAGS}" \
        -DCMAKE_CXX_FLAGS_RELEASE="${CXX_FLAGS}" \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=ON \
        -DGGML_NATIVE=OFF \
        -DGGML_AVX2=ON \
        -DGGML_AVX=ON \
        -DGGML_FMA=ON \
        -DGGML_F16C=ON \
        -DGGML_OPENMP=ON \
        ${BLAS_FLAG} \
        -DLLAMA_CURL=ON \
        -DLLAMA_OPENSSL=ON && \
    cmake --build build --config Release --target llama-server -j"${JOBS}" && \
    # Log dynamic dependencies for debugging; never fail the build on this.
    echo "=== Binary dependencies ===" && \
    ldd build/bin/llama-server || true
# Compile the C++ control-plane manager (Boost.Beast + Boost JSON, header-only)
COPY cpp/ /build/cpp/
RUN g++ /build/cpp/*.cpp \
        -std=c++17 -O2 -pthread \
        -DBOOST_ERROR_CODE_HEADER_ONLY \
        -o /build/llm-manager
# ---------- Runtime stage ----------
FROM debian:bookworm-slim

# Runtime dependencies, including SSL/HTTPS support for model downloads
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libcurl4 \
        libgomp1 \
        libopenblas0-pthread \
        libstdc++6 \
        openssl \
    && rm -rf /var/lib/apt/lists/*

# Bring over the llama-server binary, its shared libraries, and the manager
COPY --from=builder /build/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server
COPY --from=builder /build/llama.cpp/build/bin/*.so.* /usr/local/lib/
COPY --from=builder /build/llm-manager /usr/local/bin/llm-manager

# Refresh the dynamic linker cache so the copied .so files are found
RUN ldconfig
# Install Python and the FastAPI app dependencies in a single layer.
# Merging the apt install and pip install into one RUN (previously two)
# reduces the image layer count and keeps the apt lists cleanup in the same
# layer that created them.
# NOTE(review): pip packages are unpinned, so rebuilds are not reproducible —
# consider pinning versions or adding a requirements.txt.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/* \
    && pip3 install --no-cache-dir --break-system-packages \
        fastapi uvicorn aiohttp pydantic duckduckgo-search beautifulsoup4 lxml

# Create non-root user and the llama.cpp model cache directory
RUN useradd -m -u 1000 user && \
    mkdir -p /home/user/.cache/llama.cpp && \
    chown -R user:user /home/user

# Copy application code (Python package lives under python/)
COPY --chown=user:user python/ /home/user/python/

USER user
WORKDIR /home/user

# Set environment variables (PYTHONPATH points at the new package folder)
ENV HOME=/home/user \
    LLAMA_CACHE=/home/user/.cache/llama.cpp \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONPATH=/home/user/python \
    PYTHONUNBUFFERED=1
EXPOSE 7860

# --- Entrypoint alternatives (kept for reference) ---------------------------
# Previous default: run the FastAPI app directly (it managed llama-server
# internally):
#CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
#
# Optional: run the llama.cpp C++ server directly (temporary rollout).
# The manager CMD at the bottom is the default; uncomment ONE of the
# following to bypass it.
#
# Example DeepSeek (4k context):
# CMD ["llama-server", "-hf", "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
# "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]
#
# Example Mixtral-8x7B-Instruct (known loader incompatibilities on newer llama.cpp + too large for 10GB RAM):
# CMD ["llama-server", "-hf", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF:mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "64", "--ubatch-size", "32"]
# CMD ["llama-server", "-hf", "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m", "--host", "0.0.0.0", "--port", "7860", "-c", "8192", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "128", "--ubatch-size", "64"]
#
# Example Qwen2.5-Coder 7B Instruct (32k context):
# CMD ["llama-server", "-hf", "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Qwen2.5-Coder-7B-Instruct.Q4_K_M.gguf",
# "--host", "0.0.0.0", "--port", "7860", "-c", "32768", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]

# --- Active entrypoint: the C++ manager process -----------------------------
# The manager:
#   - loads the default model at startup
#   - supports /switch-model runtime model change
#   - proxies /v1/chat/completions to the active worker
ENV DEFAULT_MODEL=QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m \
    MANAGER_HOST=0.0.0.0 \
    MANAGER_PORT=7860 \
    WORKER_BASE_PORT=8080 \
    SWITCH_TIMEOUT_SEC=300 \
    MODEL_N_CTX=8192 \
    MODEL_THREADS=4 \
    MODEL_NGL=0 \
    MODEL_BATCH=64 \
    MODEL_UBATCH=32

CMD ["llm-manager"]