# Source: qwei3 / Dockerfile (Hugging Face Space)
# Author: DarkMindForever
# Commit: 027f6f1 (verified) — "Update Dockerfile"
FROM ghcr.io/ggml-org/llama.cpp:server

# Root is needed only for package install and the model download; we drop to
# UID 1000 before any runtime configuration.
USER root

# curl is used both for the model download below and the HEALTHCHECK probe.
# --no-install-recommends + cleaning the apt lists in the same layer keeps the
# image small (cache removed in a later layer would still bloat this one).
RUN apt-get update && apt-get install -y --no-install-recommends \
      ca-certificates \
      curl \
    && rm -rf /var/lib/apt/lists/*

# Download the Gemma-3n-E2B GGUF model.
# -f makes curl fail the build on an HTTP error instead of silently saving an
# HTML error page as /models/model.gguf; chown in the SAME layer so the
# runtime user (1000) can mmap/mlock the file.
RUN mkdir -p /models && \
    curl -fL https://huggingface.co/unsloth/gemma-3n-E2B-it-GGUF/resolve/main/gemma-3n-E2B-it-Q4_0.gguf \
      -o /models/model.gguf && \
    chown -R 1000:1000 /models

# Run the server as a non-root user.
USER 1000

# --- High-Performance Server Configuration ---
# llama-server reads its configuration from LLAMA_ARG_* environment variables.
ENV LLAMA_ARG_MODEL=/models/model.gguf \
    LLAMA_ARG_HOST=0.0.0.0 \
    LLAMA_ARG_PORT=7860 \
    LLAMA_ARG_CTX_SIZE=8192

# --- PERFORMANCE TUNING ---
# 1. Threading: match physical cores (a standard 16 GB PC usually has 8-12 cores).
ENV LLAMA_ARG_THREADS=8
# 2. Flash Attention: significant speed boost where supported.
ENV LLAMA_ARG_FLASH_ATTN=true
# 3. NO_MMAP=false means mmap stays ENABLED (the default). Set to true to
#    disable mmap and load the model fully into RAM, which is sometimes faster
#    on RAM-constrained systems.
ENV LLAMA_ARG_NO_MMAP=false
# 4. Memory locking: prevents the OS from swapping the model to disk.
ENV LLAMA_ARG_MLOCK=true
# 5. Batching: smaller batches for faster single-user response.
ENV LLAMA_ARG_BATCH_SIZE=512 \
    LLAMA_ARG_UBATCH_SIZE=128

# Documentation only (does not publish the port): the server listens on 7860.
# Unprivileged port, so binding works as UID 1000.
EXPOSE 7860

# Cheap liveness probe against llama-server's built-in /health endpoint.
HEALTHCHECK --interval=30s --timeout=15s --start-period=10s --retries=3 \
  CMD curl -fsS http://localhost:7860/health || exit 1

# Exec form: llama-server runs as PID 1 and receives SIGTERM from `docker stop`.
ENTRYPOINT ["/app/llama-server"]