waddie committed on
Commit
601ed38
·
verified ·
1 Parent(s): 95c1b8c

Update Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +10 -23
Dockerfile CHANGED
@@ -1,31 +1,18 @@
1
- FROM python:3.10-slim
 
2
 
3
- # Install ONLY wget to fetch your model file (No compiler tools needed!)
 
4
  RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
5
 
6
- # Pull down the ultra-fast uv binary
7
- COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
8
-
9
- WORKDIR /app
10
-
11
- # Pin execution explicitly to pure CPU backends
12
- ENV LLAMA_GGML_BACKEND=cpu
13
-
14
- # Install the official, pre-compiled x86_64 CPU wheel instantly via UV
15
- RUN uv pip install --system --no-cache \
16
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu \
17
- "llama-cpp-python[server]"
18
-
19
- # Stream down your 4.68 GB model file
20
- RUN wget -O model.gguf "https://huggingface.co/waddie/mini-2.0-GGUF/resolve/main/mini-2.0-Q4_K_M.gguf"
21
 
22
  EXPOSE 7860
23
 
24
- # Revert to executing via python's native server module
25
- CMD ["python3", "-m", "llama_cpp.server", \
26
- "--model", "model.gguf", \
27
  "--host", "0.0.0.0", \
28
  "--port", "7860", \
29
- "--n_threads", "2", \
30
- "--n_ctx", "4096", \
31
- "--chat_format", "chatml"]
 
1
+ # Pull the official, pre-compiled C++ server image
2
+ FROM ghcr.io/ggml-org/llama.cpp:server
3
 
4
+ # Switch to root to install wget (no later USER instruction switches back)
5
+ USER root
6
  RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
7
 
8
+ # Download your GGUF model
9
+ RUN wget -O /model.gguf "https://huggingface.co/waddie/mini-2.0-GGUF/resolve/main/mini-2.0-Q4_K_M.gguf"
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  EXPOSE 7860
12
 
13
+ # Run the native server with the model path, bind address, port, and context size
14
+ # (The image's ENTRYPOINT is automatically the llama-server binary)
15
+ CMD ["--model", "/model.gguf", \
16
  "--host", "0.0.0.0", \
17
  "--port", "7860", \
18
+ "--ctx-size", "4096"]