# AGI / Dockerfile
# Author: Dmitry Beresnev
# Commit 332826f: refactors the C++ LLM manager into modular components,
# moves Python modules under python/, and keeps the current control-plane
# behavior intact. The C++ server now has clearer separation for config,
# model lifecycle, runtime services, request parsing, HTTP helpers, and
# server routing, while Docker build/runtime paths were updated to compile
# multiple C++ files and load Python code from the new package folder.
FROM debian:bookworm-slim AS builder

# Toolchain and development headers for compiling llama.cpp (CMake/Ninja,
# OpenBLAS, cURL+OpenSSL for HuggingFace downloads) and the C++ manager
# (header-only Boost plus nlohmann/json). List is sorted alphabetically.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        libboost-dev \
        libcurl4-openssl-dev \
        libopenblas-dev \
        libssl-dev \
        ninja-build \
        nlohmann-json3-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*
# Clone and build llama.cpp with SSL support for HuggingFace Hub
WORKDIR /build
# Bumping CACHEBUST invalidates only the clone/build layer below, forcing a
# fresh checkout/build without disturbing earlier cached layers.
ARG CACHEBUST=6
# Git ref to build. Defaults to master, so the build is NOT reproducible
# unless a fixed tag or commit is supplied via --build-arg LLAMA_CPP_REF=...
ARG LLAMA_CPP_REF=master
# BUILD_PROFILE trades image-build time for inference speed:
#   fast_build   — -O1, BLAS off, single compile job (quick builds, slow runtime)
#   fast_runtime — -O3, OpenBLAS on, BUILD_JOBS parallel jobs (slow builds,
#                  fast runtime); currently the default
#ARG BUILD_PROFILE=fast_build
ARG BUILD_PROFILE=fast_runtime
ARG BUILD_JOBS=1
# Builds only the llama-server target. GGML_NATIVE=OFF with explicit
# AVX/AVX2/FMA/F16C keeps the binary portable across modern x86-64 hosts
# instead of being tuned to the build machine. LLAMA_CURL/LLAMA_OPENSSL
# enable HTTPS model downloads. The trailing `ldd ... || true` only logs the
# shared-library dependencies (to know what to copy into the runtime stage)
# and never fails the build.
RUN git clone --depth 1 --branch ${LLAMA_CPP_REF} https://github.com/ggerganov/llama.cpp.git && \
cd llama.cpp && \
if [ "${BUILD_PROFILE}" = "fast_runtime" ]; then \
C_FLAGS="-O3 -DNDEBUG"; \
CXX_FLAGS="-O3 -DNDEBUG"; \
BLAS_FLAG="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"; \
JOBS="${BUILD_JOBS}"; \
else \
C_FLAGS="-O1 -DNDEBUG"; \
CXX_FLAGS="-O1 -DNDEBUG"; \
BLAS_FLAG="-DGGML_BLAS=OFF"; \
JOBS="1"; \
fi && \
cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_FLAGS_RELEASE="${C_FLAGS}" \
-DCMAKE_CXX_FLAGS_RELEASE="${CXX_FLAGS}" \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DGGML_NATIVE=OFF \
-DGGML_AVX2=ON \
-DGGML_AVX=ON \
-DGGML_FMA=ON \
-DGGML_F16C=ON \
-DGGML_OPENMP=ON \
${BLAS_FLAG} \
-DLLAMA_CURL=ON \
-DLLAMA_OPENSSL=ON && \
cmake --build build --config Release --target llama-server -j"${JOBS}" && \
echo "=== Binary dependencies ===" && \
ldd build/bin/llama-server || true
# Compile the C++ control-plane manager (Boost.Beast + nlohmann/json).
# BOOST_ERROR_CODE_HEADER_ONLY avoids needing compiled Boost.System libs;
# only headers from libboost-dev are required.
COPY cpp/ /build/cpp/
RUN g++ -std=c++17 -O2 -pthread -DBOOST_ERROR_CODE_HEADER_ONLY \
        -o /build/llm-manager \
        /build/cpp/*.cpp
# Runtime stage: minimal shared libraries only, no build toolchain.
FROM debian:bookworm-slim

# cURL/OpenSSL for HTTPS model downloads, OpenMP + OpenBLAS for inference,
# libstdc++ for the C++ binaries. Sorted alphabetically.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libcurl4 \
        libgomp1 \
        libopenblas0-pthread \
        libstdc++6 \
        openssl \
    && rm -rf /var/lib/apt/lists/*
# Bring the compiled binaries and the llama.cpp shared libraries over from
# the builder stage, then refresh the dynamic-linker cache so /usr/local/lib
# resolves at runtime.
COPY --from=builder /build/llm-manager /usr/local/bin/llm-manager
COPY --from=builder /build/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server
COPY --from=builder /build/llama.cpp/build/bin/*.so.* /usr/local/lib/
RUN ldconfig
# Python runtime plus the packages needed by the FastAPI control-plane code.
# apt and pip run in a single layer so the apt lists are cleaned in the same
# layer that created them. --break-system-packages is required on Debian
# bookworm to install into the system interpreter (PEP 668).
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/* \
    && pip3 install --no-cache-dir --break-system-packages \
        aiohttp \
        beautifulsoup4 \
        duckduckgo-search \
        fastapi \
        lxml \
        pydantic \
        uvicorn
# Non-root runtime user (fixed UID 1000) with a writable model cache for
# llama.cpp downloads.
RUN useradd --create-home --uid 1000 user \
    && mkdir -p /home/user/.cache/llama.cpp \
    && chown -R user:user /home/user
# Application code: the Python package lives under /home/user/python.
COPY --chown=user:user python/ /home/user/python/
USER user
WORKDIR /home/user
# Set environment variables
#   LLAMA_CACHE      — where llama-server stores downloaded GGUF models
#                      (the directory created and chown'd to `user` above)
#   PATH             — include pip's per-user bin directory
#   PYTHONPATH       — makes modules under /home/user/python importable
#                      as top-level packages
#   PYTHONUNBUFFERED — flush Python stdout/stderr immediately so container
#                      logs stream in real time
ENV HOME=/home/user \
LLAMA_CACHE=/home/user/.cache/llama.cpp \
PATH=/home/user/.local/bin:$PATH \
PYTHONPATH=/home/user/python \
PYTHONUNBUFFERED=1
# EXPOSE is documentation only; the manager itself binds 0.0.0.0:7860.
EXPOSE 7860
# Former default: FastAPI app that managed llama-server internally. It has
# been superseded by the C++ llm-manager CMD at the end of this file.
#CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
# --- Optional: run llama.cpp C++ server directly (temporary rollout) ---
# The llm-manager CMD below is the active default. Uncomment ONE of the
# following to run llama-server directly instead.
#
# Example DeepSeek (4k context):
# CMD ["llama-server", "-hf", "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
# "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]
#
# Example Mixtral-8x7B-Instruct (known loader incompatibilities on newer llama.cpp + too large for 10GB RAM):
# CMD ["llama-server", "-hf", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF:mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "64", "--ubatch-size", "32"]
# CMD ["llama-server", "-hf", "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m", "--host", "0.0.0.0", "--port", "7860", "-c", "8192", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "128", "--ubatch-size", "64"]
#
# Active manager process:
# - loads default model at startup
# - supports /switch-model runtime model change
# - proxies /v1/chat/completions to active worker
# Manager configuration (read by llm-manager; defined in the cpp/ sources):
#   DEFAULT_MODEL      — HuggingFace repo:quant spec loaded at startup
#   MANAGER_HOST/PORT  — listen address/port of the control-plane HTTP server
#   WORKER_BASE_PORT   — presumably the first port assigned to spawned
#                        llama-server workers; confirm against cpp/ sources
#   SWITCH_TIMEOUT_SEC — time budget for a /switch-model operation
#   MODEL_N_CTX        — context window (tokens) for llama-server
#   MODEL_THREADS      — CPU threads for inference
#   MODEL_NGL          — GPU layers to offload (0 = CPU-only)
#   MODEL_BATCH/UBATCH — logical/physical batch sizes (kept small to fit
#                        constrained RAM)
ENV DEFAULT_MODEL=QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m \
MANAGER_HOST=0.0.0.0 \
MANAGER_PORT=7860 \
WORKER_BASE_PORT=8080 \
SWITCH_TIMEOUT_SEC=300 \
MODEL_N_CTX=8192 \
MODEL_THREADS=4 \
MODEL_NGL=0 \
MODEL_BATCH=64 \
MODEL_UBATCH=32
# Exec-form CMD: llm-manager runs as PID 1 and receives SIGTERM from
# `docker stop` directly (no intermediate shell).
CMD ["llm-manager"]
#
# Example Qwen2.5-Coder 7B Instruct (32k context):
# CMD ["llama-server", "-hf", "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Qwen2.5-Coder-7B-Instruct.Q4_K_M.gguf",
# "--host", "0.0.0.0", "--port", "7860", "-c", "32768", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "512"]