# syntax=docker/dockerfile:1

# ---------- Build stage: compile llama.cpp (llama-server) and the C++ manager ----------
FROM debian:bookworm-slim AS builder

# Build dependencies: toolchain, CMake/Ninja, and dev headers for curl/SSL
# (llama-server pulls models from the HuggingFace Hub over HTTPS),
# Boost (manager uses Beast/JSON), OpenBLAS and nlohmann-json.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        libboost-dev \
        libcurl4-openssl-dev \
        libopenblas-dev \
        libssl-dev \
        ninja-build \
        nlohmann-json3-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build

# CACHEBUST: bump the value to force a fresh clone/build (master moves).
ARG CACHEBUST=6
# NOTE(review): "master" is not reproducible — pin a release tag or commit SHA
# for deterministic images.
ARG LLAMA_CPP_REF=master
#ARG BUILD_PROFILE=fast_build
ARG BUILD_PROFILE=fast_runtime
ARG BUILD_JOBS=1

# Clone and build llama-server.
#  - fast_runtime: -O3 + OpenBLAS, parallel build (BUILD_JOBS jobs)
#  - fast_build:   -O1, no BLAS, single job (faster CI iteration)
# GGML_NATIVE=OFF with explicit AVX/AVX2/FMA/F16C keeps the binary portable
# across x86-64 hosts that support those extensions (no -march=native).
RUN git clone --depth 1 --branch ${LLAMA_CPP_REF} https://github.com/ggerganov/llama.cpp.git && \
    cd llama.cpp && \
    if [ "${BUILD_PROFILE}" = "fast_runtime" ]; then \
        C_FLAGS="-O3 -DNDEBUG"; \
        CXX_FLAGS="-O3 -DNDEBUG"; \
        BLAS_FLAG="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"; \
        JOBS="${BUILD_JOBS}"; \
    else \
        C_FLAGS="-O1 -DNDEBUG"; \
        CXX_FLAGS="-O1 -DNDEBUG"; \
        BLAS_FLAG="-DGGML_BLAS=OFF"; \
        JOBS="1"; \
    fi && \
    cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_C_FLAGS_RELEASE="${C_FLAGS}" \
        -DCMAKE_CXX_FLAGS_RELEASE="${CXX_FLAGS}" \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=ON \
        -DGGML_NATIVE=OFF \
        -DGGML_AVX2=ON \
        -DGGML_AVX=ON \
        -DGGML_FMA=ON \
        -DGGML_F16C=ON \
        -DGGML_OPENMP=ON \
        ${BLAS_FLAG} \
        -DLLAMA_CURL=ON \
        -DLLAMA_OPENSSL=ON && \
    cmake --build build --config Release --target llama-server -j"${JOBS}" && \
    echo "=== Binary dependencies ===" && \
    ldd build/bin/llama-server || true && \
    mkdir -p /build/llama-libs && \
    find build -type f \( -name '*.so' -o -name '*.so.*' \) -exec cp -v {} /build/llama-libs/ \; || true

# Build the C++ manager (Boost.Beast + JSON, header-only error codes).
COPY cpp/ /build/cpp/
RUN g++ -std=c++17 -O2 -DBOOST_ERROR_CODE_HEADER_ONLY -pthread \
        -I/build/cpp \
        /build/cpp/config.cpp \
        /build/cpp/http_helpers.cpp \
        /build/cpp/llm_manager.cpp \
        /build/cpp/llm_manager_types.cpp \
        /build/cpp/model_manager.cpp \
        /build/cpp/request_parsing.cpp \
        /build/cpp/runtime_components.cpp \
        /build/cpp/server.cpp \
        -o /build/llm-manager

# ---------- Runtime stage ----------
FROM debian:bookworm-slim

# Runtime libraries (curl/SSL for HuggingFace downloads, libgomp + OpenBLAS
# for inference) plus Python for the FastAPI app — all in ONE layer so a
# single `apt-get update` covers every install (avoids the stale-cache /
# extra-layer problem of a second `apt-get update` later in the file).
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libcurl4 \
        libgomp1 \
        libopenblas0-pthread \
        libstdc++6 \
        openssl \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Copy llama-server, its bundled shared libraries, and the manager binary
# from the builder stage.
COPY --from=builder /build/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server
COPY --from=builder /build/llama-libs/ /usr/local/lib/
COPY --from=builder /build/llm-manager /usr/local/bin/llm-manager

# Refresh the dynamic-linker cache so the copied .so files in /usr/local/lib
# are found at runtime.
RUN ldconfig

# Python packages for the FastAPI app.
# NOTE(review): versions are unpinned — pin them (or install from a
# requirements.txt) for reproducible builds.
RUN pip3 install --no-cache-dir fastapi uvicorn aiohttp pydantic duckduckgo-search beautifulsoup4 lxml --break-system-packages

# Non-root user; llama.cpp caches downloaded models under ~/.cache/llama.cpp.
RUN useradd -m -u 1000 user && \
    mkdir -p /home/user/.cache/llama.cpp && \
    chown -R user:user /home/user

# Application code.
COPY --chown=user:user python/ /home/user/python/

USER user
WORKDIR /home/user

ENV HOME=/home/user \
    LLAMA_CACHE=/home/user/.cache/llama.cpp \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONPATH=/home/user/python \
    PYTHONUNBUFFERED=1

EXPOSE 7860

# Start FastAPI app (which manages llama-server internally)
#CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

# --- Optional: run llama.cpp C++ server directly (temporary rollout) ---
# Keep the FastAPI CMD above as the default. Uncomment ONE of the following
# to run the C++ server directly instead of the Python app.
#
# Example DeepSeek (4k context):
# CMD ["llama-server", "-hf", "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
#      "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]
#
# Example Mixtral-8x7B-Instruct (known loader incompatibilities on newer
# llama.cpp + too large for 10GB RAM):
# CMD ["llama-server", "-hf", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF:mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "64", "--ubatch-size", "32"]
#
# Example Qwen2.5 (quantized, 8k context):
# CMD ["llama-server", "-hf", "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m", "--host", "0.0.0.0", "--port", "7860", "-c", "8192", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "128", "--ubatch-size", "64"]

# Default configuration for the active manager process, which:
#   - loads DEFAULT_MODEL at startup
#   - supports /switch-model for runtime model changes
#   - proxies /v1/chat/completions to the active worker
# Manager listens on MANAGER_HOST:MANAGER_PORT; worker llama-server instances
# are started from WORKER_BASE_PORT upward. MODEL_* values are the per-worker
# llama-server defaults (context size, threads, GPU layers, batch sizes).
ENV DEFAULT_MODEL=QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m \
    MANAGER_HOST=0.0.0.0 \
    MANAGER_PORT=7860 \
    WORKER_BASE_PORT=8080 \
    SWITCH_TIMEOUT_SEC=300 \
    MODEL_N_CTX=8192 \
    MODEL_THREADS=4 \
    MODEL_NGL=0 \
    MODEL_BATCH=64 \
    MODEL_UBATCH=32

# Exec form: llm-manager runs as PID 1 and receives SIGTERM from `docker stop`.
CMD ["llm-manager"]

# Example Qwen2.5-Coder 7B Instruct (32k context):
# CMD ["llama-server", "-hf", "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Qwen2.5-Coder-7B-Instruct.Q4_K_M.gguf",
#      "--host", "0.0.0.0", "--port", "7860", "-c", "32768", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]