FROM archlinux:latest

# Build args passed from the Space environment
ARG MODEL_ID="unsloth/Qwen3-0.6B-GGUF"
ARG QUANT="Q8_0"
ARG SERVED_NAME="qwen-nano"
ARG PARALLEL=4
ARG CTX_SIZE=4096
ARG EMBEDDING_ONLY=0
ARG RERANK_ONLY=0

# llama.cpp server configuration via LLAMA_ARG_* environment variables
ENV LLAMA_ARG_HF_REPO="${MODEL_ID}:${QUANT}"
ENV LLAMA_ARG_CTX_SIZE=${CTX_SIZE}
ENV LLAMA_ARG_BATCH=512
ENV LLAMA_ARG_N_PARALLEL=${PARALLEL}
ENV LLAMA_ARG_FLASH_ATTN=on
# ENV LLAMA_ARG_CACHE_TYPE_K="q8_0"
# ENV LLAMA_ARG_CACHE_TYPE_V="q4_1"
ENV LLAMA_ARG_MLOCK=1
ENV LLAMA_ARG_N_GPU_LAYERS=0
ENV LLAMA_ARG_HOST="0.0.0.0"
ENV LLAMA_ARG_PORT=7860
ENV LLAMA_ARG_ALIAS="${SERVED_NAME}"
ENV LLAMA_ARG_EMBEDDINGS=${EMBEDDING_ONLY}
ENV LLAMA_ARG_RERANKING=${RERANK_ONLY}
ENV LLAMA_ARG_ENDPOINT_METRICS=1

# Update the base system and install build dependencies
RUN pacman -Syu --noconfirm --overwrite '*'
RUN pacman -S base-devel git git-lfs cmake curl openblas openblas64 blas64-openblas python gcc-libs glibc --noconfirm --overwrite '*'

# Cache dir for llama.cpp to download models (must be writable by the unprivileged Space user)
RUN mkdir -p /app /.cache && chmod -R 777 /.cache

WORKDIR /app
RUN git clone --depth 1 --single-branch --branch master https://github.com/ggml-org/llama.cpp.git
# RUN git clone https://github.com/ikawrakow/ik_llama.cpp.git llama.cpp

# Configure and build only the server binary, with OpenBLAS and native optimizations
WORKDIR /app/llama.cpp
RUN cmake -B build \
    -DGGML_LTO=ON \
    -DLLAMA_CURL=ON \
    -DLLAMA_BUILD_SERVER=ON \
    -DLLAMA_BUILD_EXAMPLES=ON \
    -DGGML_ALL_WARNINGS=OFF \
    -DGGML_ALL_WARNINGS_3RD_PARTY=OFF \
    -DGGML_BLAS=ON \
    -DGGML_BLAS_VENDOR=OpenBLAS \
    -DGGML_NATIVE=ON \
    -DGGML_LLAMAFILE=ON \
    -Wno-dev \
    -DCMAKE_BUILD_TYPE=Release
RUN cmake --build build --config Release --target llama-server -j $(nproc)

WORKDIR /app
EXPOSE 7860

CMD ["/app/llama.cpp/build/bin/llama-server", "--verbose-prompt", "--prio", "3"]
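
# Example usage (a sketch: the image tag and curl payload below are illustrative
# assumptions; only the ARG names, the served alias, and port 7860 come from this
# Dockerfile):
#
#   docker build -t llamacpp-space \
#     --build-arg MODEL_ID="unsloth/Qwen3-0.6B-GGUF" \
#     --build-arg QUANT="Q8_0" \
#     --build-arg SERVED_NAME="qwen-nano" .
#
#   docker run -p 7860:7860 llamacpp-space
#
#   # llama-server exposes an OpenAI-compatible API and, with
#   # LLAMA_ARG_ENDPOINT_METRICS=1, a Prometheus-style /metrics endpoint:
#   curl http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "qwen-nano", "messages": [{"role": "user", "content": "Hello"}]}'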