File size: 5,627 Bytes
7f69342
441479b
7f69342
4ec7108
7f69342
 
6e29991
1a4efad
7f69342
09e70ff
950f41b
fc0860f
a97386f
fc0860f
950f41b
4ec7108
 
f64a284
7f69342
950f41b
0e913e4
a97386f
 
c33410f
0e913e4
7f69342
a97386f
 
 
 
 
 
 
 
 
 
 
1a4efad
a97386f
 
c33410f
 
 
309e664
1a4efad
 
 
 
 
a97386f
950f41b
 
a97386f
db57dc8
58d70b1
 
 
441479b
fc0860f
332826f
fc0860f
acdc6c1
 
 
 
 
 
 
 
 
 
fc0860f
7f69342
 
7b82554
f64a284
7f69342
09e70ff
7f69342
d9a4451
a97386f
d9a4451
f64a284
7f69342
441479b
8c68c1f
7f69342
58d70b1
fc0860f
8c68c1f
 
 
ba2be63
dde400a
 
 
 
 
 
 
7763bf4
dde400a
7f69342
6e29991
7f69342
cba98c9
6e29991
dde400a
332826f
dde400a
ba2be63
7f69342
6e29991
7f69342
ba2be63
7f69342
dde400a
332826f
dde400a
ba2be63
 
 
dde400a
9a590ac
 
 
 
 
 
 
 
 
 
fe7089d
 
fc0860f
 
fe7089d
fc0860f
 
 
 
 
 
 
 
 
 
 
 
057edf0
 
fc0860f
 
9a590ac
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
FROM debian:bookworm-slim AS builder

# Toolchain and headers for compiling llama.cpp and the C++ manager.
# Packages are listed alphabetically for easier diffing; the apt lists are
# removed in the same layer so they never persist in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    git \
    libboost-dev \
    libcurl4-openssl-dev \
    libopenblas-dev \
    libssl-dev \
    ninja-build \
    nlohmann-json3-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Clone and build llama.cpp with SSL support for HuggingFace Hub
WORKDIR /build
# Bump CACHEBUST to force a fresh clone/build below — every layer after this
# ARG is invalidated when its value changes.
ARG CACHEBUST=6
# NOTE(review): "master" is a moving target, so builds are not reproducible;
# consider pinning to a specific llama.cpp release tag.
ARG LLAMA_CPP_REF=master
#ARG BUILD_PROFILE=fast_build
# fast_runtime: -O3 + OpenBLAS, BUILD_JOBS parallel compile jobs.
# fast_build (commented alternative above): -O1, BLAS off, single job.
ARG BUILD_PROFILE=fast_runtime
# Parallel compile jobs used only by the fast_runtime profile
# (kept at 1 — presumably for memory-constrained builders; TODO confirm).
ARG BUILD_JOBS=1
# Build the llama-server target only (tests/examples off).
#  - fast_runtime branch: -O3, OpenBLAS backend, ${BUILD_JOBS} compile jobs.
#  - else (fast_build): -O1, BLAS off, single compile job.
# GGML_NATIVE=OFF plus the explicit AVX/AVX2/FMA/F16C switches target a
# portable x86-64 feature set instead of the build host's CPU.
# The trailing ldd and find steps are deliberately best-effort ("|| true"):
# ldd logs the binary's dynamic dependencies for the build output, and find
# stages any built shared libraries into /build/llama-libs so the runtime
# stage can copy them (the directory may legitimately end up empty).
RUN git clone --depth 1 --branch ${LLAMA_CPP_REF} https://github.com/ggerganov/llama.cpp.git && \
    cd llama.cpp && \
    if [ "${BUILD_PROFILE}" = "fast_runtime" ]; then \
        C_FLAGS="-O3 -DNDEBUG"; \
        CXX_FLAGS="-O3 -DNDEBUG"; \
        BLAS_FLAG="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"; \
        JOBS="${BUILD_JOBS}"; \
    else \
        C_FLAGS="-O1 -DNDEBUG"; \
        CXX_FLAGS="-O1 -DNDEBUG"; \
        BLAS_FLAG="-DGGML_BLAS=OFF"; \
        JOBS="1"; \
    fi && \
    cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_C_FLAGS_RELEASE="${C_FLAGS}" \
        -DCMAKE_CXX_FLAGS_RELEASE="${CXX_FLAGS}" \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=ON \
        -DGGML_NATIVE=OFF \
        -DGGML_AVX2=ON \
        -DGGML_AVX=ON \
        -DGGML_FMA=ON \
        -DGGML_F16C=ON \
        -DGGML_OPENMP=ON \
        ${BLAS_FLAG} \
        -DLLAMA_CURL=ON \
        -DLLAMA_OPENSSL=ON && \
    cmake --build build --config Release --target llama-server -j"${JOBS}" && \
    echo "=== Binary dependencies ===" && \
    ldd build/bin/llama-server || true && \
    mkdir -p /build/llama-libs && \
    find build -type f \( -name '*.so' -o -name '*.so.*' \) -exec cp -v {} /build/llama-libs/ \; || true

# Build C++ manager (Boost.Beast + JSON)
# Single g++ invocation linking all manager translation units into one
# binary. BOOST_ERROR_CODE_HEADER_ONLY keeps Boost header-only, so no Boost
# runtime libraries are needed in the final image. Source files are listed
# explicitly (rather than a *.cpp glob) so a stray file in cpp/ cannot
# silently join the link.
COPY cpp/ /build/cpp/
RUN g++ -std=c++17 -O2 -DBOOST_ERROR_CODE_HEADER_ONLY -pthread \
    -I/build/cpp \
    /build/cpp/config.cpp \
    /build/cpp/http_helpers.cpp \
    /build/cpp/llm_manager.cpp \
    /build/cpp/llm_manager_types.cpp \
    /build/cpp/model_manager.cpp \
    /build/cpp/request_parsing.cpp \
    /build/cpp/runtime_components.cpp \
    /build/cpp/server.cpp \
    -o /build/llm-manager

# ---- Runtime stage ----
FROM debian:bookworm-slim

# Runtime shared libraries only (no compilers or headers), including the
# SSL/HTTPS pieces needed to pull models over HTTPS. Package list kept
# alphabetical; apt lists are purged in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    libcurl4 \
    libgomp1 \
    libopenblas0-pthread \
    libstdc++6 \
    openssl \
    && rm -rf /var/lib/apt/lists/*

# Copy llama-server binary and all shared libraries from builder
COPY --from=builder /build/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server
# Shared libs staged by the builder's find step (directory may be empty if
# llama.cpp was built fully static).
COPY --from=builder /build/llama-libs/ /usr/local/lib/
COPY --from=builder /build/llm-manager /usr/local/bin/llm-manager

# Update library cache
# Refresh the dynamic-linker cache so the libraries just copied into
# /usr/local/lib are resolvable at runtime.
RUN ldconfig

# Install Python and FastAPI dependencies
# NOTE(review): this is a second apt layer after the runtime-deps one above;
# they could be merged into one layer — kept separate here, presumably for
# cache granularity (TODO confirm).
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Install Python packages
# --break-system-packages is required to install into Debian bookworm's
# PEP 668-protected system Python.
# NOTE(review): all packages are unpinned, so rebuilds may pull different
# versions — consider pinning (pkg==x.y.z) for reproducibility.
RUN pip3 install --no-cache-dir fastapi uvicorn aiohttp pydantic duckduckgo-search beautifulsoup4 lxml --break-system-packages

# Unprivileged runtime account with a fixed UID, plus a pre-created,
# user-owned cache directory for llama.cpp model downloads.
RUN useradd --create-home --uid 1000 user \
    && mkdir -p /home/user/.cache/llama.cpp \
    && chown -R user:user /home/user

# Application code, owned by the runtime user so no follow-up chown layer
# is needed.
COPY --chown=user:user python/ /home/user/python/

USER user
WORKDIR /home/user

# Set environment variables
# LLAMA_CACHE points model downloads at the pre-created cache directory;
# PYTHONPATH exposes the copied python/ tree as importable modules;
# PYTHONUNBUFFERED keeps container logs flushed line-by-line.
ENV HOME=/home/user \
    LLAMA_CACHE=/home/user/.cache/llama.cpp \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONPATH=/home/user/python \
    PYTHONUNBUFFERED=1

# Documentation only — does not publish the port; 7860 matches MANAGER_PORT
# configured further below.
EXPOSE 7860

# Former default: FastAPI app that managed llama-server internally.
# Disabled in favor of the C++ llm-manager CMD below.
#CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

# --- Optional: run llama.cpp's llama-server directly (temporary rollout) ---
# The active default is the C++ llm-manager CMD further below. Uncomment ONE
# of the following to run llama-server directly instead.
#
# Example DeepSeek (4k context):
# CMD ["llama-server", "-hf", "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
#      "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]
#
# Example Mixtral-8x7B-Instruct (known loader incompatibilities on newer llama.cpp + too large for 10GB RAM):
# CMD ["llama-server", "-hf", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF:mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "64", "--ubatch-size", "32"]

# CMD ["llama-server", "-hf", "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m", "--host", "0.0.0.0", "--port", "7860", "-c", "8192", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "128", "--ubatch-size", "64"]
#
# Active manager process:
# - loads default model at startup
# - supports /switch-model runtime model change
# - proxies /v1/chat/completions to active worker
# Defaults for the C++ manager, overridable at `docker run -e ...`.
# MODEL_* values mirror llama-server tuning flags (-c, -t, -ngl, -b,
# --ubatch-size); the remaining keys are read by llm-manager — names suggest
# listen host/port, worker port allocation and model-switch timeout
# (NOTE(review): verify against cpp/config.cpp).
ENV DEFAULT_MODEL=QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m \
    MANAGER_HOST=0.0.0.0 \
    MANAGER_PORT=7860 \
    WORKER_BASE_PORT=8080 \
    SWITCH_TIMEOUT_SEC=300 \
    MODEL_N_CTX=8192 \
    MODEL_THREADS=4 \
    MODEL_NGL=0 \
    MODEL_BATCH=64 \
    MODEL_UBATCH=32

# Exec-form CMD: llm-manager runs as PID 1 and receives stop signals directly.
CMD ["llm-manager"]
#
# Example Qwen2.5-Coder 7B Instruct (32k context):
# CMD ["llama-server", "-hf", "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Qwen2.5-Coder-7B-Instruct.Q4_K_M.gguf",
#      "--host", "0.0.0.0", "--port", "7860", "-c", "32768", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]