File size: 1,469 Bytes
f5b5b66
 
7e4778c
 
f5b5b66
 
 
 
7e4778c
f5b5b66
 
7e4778c
f5b5b66
 
7e4778c
f5b5b66
 
 
 
 
 
7e4778c
 
4cb0463
 
f5b5b66
 
 
 
 
c802a0e
4cb0463
 
 
 
c802a0e
 
 
f5b5b66
 
 
 
 
 
c802a0e
f5b5b66
 
c4c2b5b
f5b5b66
 
 
 
c4c2b5b
ab4bbe1
f5b5b66
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
FROM debian:bookworm-slim

# 1. Install build and runtime dependencies
# --no-install-recommends keeps the layer small by skipping optional packages.
# ca-certificates is therefore listed explicitly: it is normally pulled in only
# as a recommended package, and HTTPS access (git clone, model download) needs it.
# pkg-config fixes the "Could NOT find PkgConfig" error during the CMake configure.
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    git \
    libcurl4-openssl-dev \
    libopenblas-dev \
    libssl-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# 2. Setup Hugging Face User
# Create the unprivileged account "user" (UID 1000) with a home directory,
# run every later instruction as that account, and work from ~/app.
RUN useradd --create-home --uid 1000 user
USER user
ENV HOME=/home/user
# Put the user's local bin directory ahead of the inherited PATH.
ENV PATH=/home/user/.local/bin:$PATH
WORKDIR /home/user/app

# 3. Clone and build ONLY the llama-server target
# BUILD_JOBS controls build parallelism (default 8). If the build stalls
# part-way through because of RAM exhaustion, lower it, e.g.
#   docker build --build-arg BUILD_JOBS=2 .
# NOTE(review): GGML_NATIVE / GGML_AVX512* enable CPU-specific code paths;
# confirm the runtime host supports AVX-512, otherwise the binary may fail
# with an illegal-instruction error.
# NOTE(review): verify that GGML_CURL is the option name the upstream CMake
# project actually reads (it may expect a LLAMA_-prefixed option instead).
ARG BUILD_JOBS=8
RUN git clone --depth 1 https://github.com/ggerganov/llama.cpp.git . && \
    cmake -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_NATIVE=ON \
    -DGGML_AVX512=ON \
    -DGGML_AVX512_VNNI=ON \
    -DGGML_OPENMP=ON \
    -DGGML_BLAS=ON \
    -DGGML_BLAS_VENDOR=OpenBLAS \
    -DGGML_CURL=ON && \
    cmake --build build --config Release --target llama-server -j "${BUILD_JOBS}"

# 4. Final Server Configuration
# ENTRYPOINT is the built server binary (path is relative to WORKDIR);
# CMD supplies its default arguments and can be overridden at `docker run`.
#   -hf <repo>:<quant>  pull the model directly from Hugging Face
#   --host 0.0.0.0      listen on all interfaces (required for Hugging Face Spaces networking)
#   --port 7860         port the server listens on
#   -t 8                8 threads, matched to the 8 physical cores (avoids hyperthreading slowdowns)
#   -c 4096             context size
#   --flash-attn true   turn flash attention on
#   --no-mmap           do not memory-map the model file
ENTRYPOINT ["./build/bin/llama-server"]

CMD ["-hf", "unsloth/Qwen3.5-4B-GGUF:Q8_0", \
     "--host", "0.0.0.0", "--port", "7860", \
     "-t", "8", "-c", "4096", \
     "--flash-attn", "true", "--no-mmap"]