Spaces:

anews9340
/

llama.cpp

Paused

anews9340 committed on Mar 2

Commit

c802a0e

verified ·

1 Parent(s): 5b8a4b3

Update Dockerfile

Files changed (1) hide show

Dockerfile CHANGED Viewed

@@ -24,24 +24,28 @@ WORKDIR $HOME/app
 # -DGGML_NATIVE=ON: Targets your specific Ice Lake instructions
 # -DGGML_AVX512 & VNNI: Uses the hardware acceleration flags found in your cpuinfo
 # -DGGML_CURL=ON: Enables the -hf downloading capability
-RUN git clone --depth 1 https://github.com/ggerganov/llama.cpp.git . && \
-    cmake -B build \
     -DCMAKE_BUILD_TYPE=Release \
     -DGGML_NATIVE=ON \
     -DGGML_AVX512=ON \
     -DGGML_AVX512_VNNI=ON \
     -DGGML_OPENMP=ON \
-    -DGGML_CURL=ON && \
-    cmake --build build --config Release -j 16 && \
-    cp build/bin/llama-server . && \
-    rm -rf build # Clean up build artifacts to reduce image size
 # 4. Final Server Configuration
 # -t 8: Optimized for your 8 physical cores (prevents hyperthreading slowdowns)
 # -hf: Pulls directly from Hugging Face
 # --host 0.0.0.0: Required for Hugging Face Spaces networking
 # --flash-attn: Uses AVX-512 optimized attention kernels
-ENTRYPOINT ["./llama-server"]
 CMD [ \
     "-hf", "unsloth/Qwen3.5-9B-GGUF:Q8_0", \

 # -DGGML_NATIVE=ON: Targets your specific Ice Lake instructions
 # -DGGML_AVX512 & VNNI: Uses the hardware acceleration flags found in your cpuinfo
 # -DGGML_CURL=ON: Enables the -hf downloading capability
+RUN git clone --depth 1 https://github.com/ggerganov/llama.cpp.git .
+RUN cmake -B build \
     -DCMAKE_BUILD_TYPE=Release \
     -DGGML_NATIVE=ON \
     -DGGML_AVX512=ON \
     -DGGML_AVX512_VNNI=ON \
     -DGGML_OPENMP=ON \
+    -DGGML_BLAS=ON \
+    -DGGML_BLAS_VENDOR=OpenBLAS \
+    -DGGML_CURL=ON
+# Build only the llama-server target. --config takes the build type (Release)
+# as its value, so the two must stay adjacent; --target is a separate option.
+RUN cmake --build build --config Release --target llama-server -j 8
+# Copy the binary into the working directory. No trailing "&& \" here: a
+# dangling continuation would fold the later ENTRYPOINT line into this RUN.
+RUN cp build/bin/llama-server .
 # 4. Final Server Configuration
 # -t 8: Optimized for your 8 physical cores (prevents hyperthreading slowdowns)
 # -hf: Pulls directly from Hugging Face
 # --host 0.0.0.0: Required for Hugging Face Spaces networking
 # --flash-attn: Uses AVX-512 optimized attention kernels
+ENTRYPOINT ["./build/bin/llama-server"]
 CMD [ \
     "-hf", "unsloth/Qwen3.5-9B-GGUF:Q8_0", \