FROM debian:bookworm-slim AS builder
# Toolchain and libraries required to compile llama.cpp (cmake/ninja, curl+SSL
# for model downloads, OpenBLAS for the fast_runtime profile) and the C++
# manager (Boost, nlohmann-json). Lists are removed in the same layer to keep
# the image small; packages are sorted alphabetically for diffability.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    git \
    libboost-dev \
    libcurl4-openssl-dev \
    libopenblas-dev \
    libssl-dev \
    ninja-build \
    nlohmann-json3-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*
# Clone and build llama.cpp with SSL support for HuggingFace Hub
WORKDIR /build
# Bump CACHEBUST to force a fresh clone/build of llama.cpp; it invalidates
# every layer from here on, so keep it as close to the clone step as possible.
ARG CACHEBUST=6
# NOTE(review): `master` is a moving target — builds are not reproducible.
# Pin a release tag or commit SHA for production builds.
ARG LLAMA_CPP_REF=master
# Two profiles: fast_build (-O1, no BLAS, single job — quick CI iteration)
# vs fast_runtime (-O3 + OpenBLAS — slower to compile, faster inference).
#ARG BUILD_PROFILE=fast_build
ARG BUILD_PROFILE=fast_runtime
# Parallel compile jobs for the fast_runtime profile (fast_build forces 1).
ARG BUILD_JOBS=1
# Clone, configure, and build llama-server in a single layer.
# Profile selection happens inside the RUN because ARG values are only
# visible to the shell here. AVX/AVX2/FMA/F16C are enabled explicitly with
# GGML_NATIVE=OFF so the binary targets a portable x86-64 feature set rather
# than the build host's CPU.
RUN git clone --depth 1 --branch ${LLAMA_CPP_REF} https://github.com/ggerganov/llama.cpp.git && \
cd llama.cpp && \
if [ "${BUILD_PROFILE}" = "fast_runtime" ]; then \
C_FLAGS="-O3 -DNDEBUG"; \
CXX_FLAGS="-O3 -DNDEBUG"; \
BLAS_FLAG="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"; \
JOBS="${BUILD_JOBS}"; \
else \
C_FLAGS="-O1 -DNDEBUG"; \
CXX_FLAGS="-O1 -DNDEBUG"; \
BLAS_FLAG="-DGGML_BLAS=OFF"; \
JOBS="1"; \
fi && \
cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_FLAGS_RELEASE="${C_FLAGS}" \
-DCMAKE_CXX_FLAGS_RELEASE="${CXX_FLAGS}" \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DGGML_NATIVE=OFF \
-DGGML_AVX2=ON \
-DGGML_AVX=ON \
-DGGML_FMA=ON \
-DGGML_F16C=ON \
-DGGML_OPENMP=ON \
${BLAS_FLAG} \
-DLLAMA_CURL=ON \
-DLLAMA_OPENSSL=ON && \
cmake --build build --config Release --target llama-server -j"${JOBS}" && \
echo "=== Binary dependencies ===" && \
ldd build/bin/llama-server || true && \
mkdir -p /build/llama-libs && \
# NOTE(review): the trailing `|| true` keeps the build going when no .so
# files exist (e.g. a fully static build), but it also masks genuine copy
# failures — consider checking the find output explicitly.
find build -type f \( -name '*.so' -o -name '*.so.*' \) -exec cp -v {} /build/llama-libs/ \; || true
# Compile the C++ manager from the project's cpp/ sources.
# Boost.Beast/JSON are header-only here (BOOST_ERROR_CODE_HEADER_ONLY avoids
# linking libboost_system), so only -pthread is needed at link time.
COPY cpp/ /build/cpp/
RUN g++ -o /build/llm-manager \
    -std=c++17 -O2 -pthread \
    -DBOOST_ERROR_CODE_HEADER_ONLY \
    -I/build/cpp \
    /build/cpp/config.cpp \
    /build/cpp/http_helpers.cpp \
    /build/cpp/llm_manager.cpp \
    /build/cpp/llm_manager_types.cpp \
    /build/cpp/model_manager.cpp \
    /build/cpp/request_parsing.cpp \
    /build/cpp/runtime_components.cpp \
    /build/cpp/server.cpp
# ---- Runtime stage: only shared-library dependencies, no toolchain ----
FROM debian:bookworm-slim
# Runtime libraries matching the builder's link-time deps: libcurl + SSL certs
# for HTTPS model downloads, libgomp for GGML_OPENMP, OpenBLAS for the
# fast_runtime profile. Sorted alphabetically; apt lists cleaned in-layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    libcurl4 \
    libgomp1 \
    libopenblas0-pthread \
    libstdc++6 \
    openssl \
    && rm -rf /var/lib/apt/lists/*
# Copy llama-server binary and all shared libraries from builder
# The builder stage collects any .so files produced by the llama.cpp build
# into /build/llama-libs; install them into the default linker path.
COPY --from=builder /build/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server
COPY --from=builder /build/llama-libs/ /usr/local/lib/
COPY --from=builder /build/llm-manager /usr/local/bin/llm-manager
# Update library cache
# Refresh the dynamic-linker cache so the freshly copied /usr/local/lib
# libraries are found at runtime.
RUN ldconfig
# Python runtime plus the FastAPI app's dependencies, installed in a single
# layer (the original used two RUNs, creating an extra layer for the same
# logical step). --break-system-packages is required on bookworm (PEP 668)
# to install into the system interpreter.
# NOTE(review): pip packages are unpinned (hadolint DL3013) — builds are not
# reproducible; consider a pinned requirements.txt. duckduckgo-search is a
# fast-moving third-party package and the most likely future breakage.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/* \
    && pip3 install --no-cache-dir --break-system-packages \
    aiohttp \
    beautifulsoup4 \
    duckduckgo-search \
    fastapi \
    lxml \
    pydantic \
    uvicorn
# Run as an unprivileged user with a stable UID; pre-create the llama.cpp
# model cache so it is writable once we drop privileges.
RUN useradd --create-home --uid 1000 user \
    && mkdir -p /home/user/.cache/llama.cpp \
    && chown -R user:user /home/user
# Application code, owned by the runtime user (avoids a chown layer).
COPY --chown=user:user python/ /home/user/python/
USER user
WORKDIR /home/user
# Set environment variables
# LLAMA_CACHE points llama-server's model downloads at the pre-created,
# user-owned cache directory; PYTHONPATH exposes the copied python/ sources;
# PYTHONUNBUFFERED keeps logs streaming in container output.
ENV HOME=/home/user \
LLAMA_CACHE=/home/user/.cache/llama.cpp \
PATH=/home/user/.local/bin:$PATH \
PYTHONPATH=/home/user/python \
PYTHONUNBUFFERED=1
# Documentation only — the port must still be published at `docker run`.
EXPOSE 7860
# Start FastAPI app (which manages llama-server internally)
#CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
# --- Optional: run llama.cpp C++ server directly (temporary rollout) ---
# The active default is the C++ manager (`CMD ["llm-manager"]` below); the
# FastAPI CMD above is kept commented out for rollback. Uncomment ONE of the
# following to run the C++ server directly instead.
#
# Example DeepSeek (4k context):
# CMD ["llama-server", "-hf", "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
# "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]
#
# Example Mixtral-8x7B-Instruct (known loader incompatibilities on newer llama.cpp + too large for 10GB RAM):
# CMD ["llama-server", "-hf", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF:mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "64", "--ubatch-size", "32"]
# CMD ["llama-server", "-hf", "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m", "--host", "0.0.0.0", "--port", "7860", "-c", "8192", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "128", "--ubatch-size", "64"]
#
# Active manager process:
# - loads default model at startup
# - supports /switch-model runtime model change
# - proxies /v1/chat/completions to active worker
# Defaults consumed by llm-manager: which model to pull at startup, where to
# listen, the first port handed to spawned llama-server workers, and the
# worker flags (context size, CPU threads, ngl=0 → CPU-only, batch sizes).
ENV DEFAULT_MODEL=QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m \
MANAGER_HOST=0.0.0.0 \
MANAGER_PORT=7860 \
WORKER_BASE_PORT=8080 \
SWITCH_TIMEOUT_SEC=300 \
MODEL_N_CTX=8192 \
MODEL_THREADS=4 \
MODEL_NGL=0 \
MODEL_BATCH=64 \
MODEL_UBATCH=32
# Exec form: llm-manager runs as PID 1 and receives SIGTERM from `docker stop`.
CMD ["llm-manager"]
#
# Example Qwen2.5-Coder 7B Instruct (32k context):
# CMD ["llama-server", "-hf", "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Qwen2.5-Coder-7B-Instruct.Q4_K_M.gguf",
# "--host", "0.0.0.0", "--port", "7860", "-c", "32768", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]
|