PreethiCarmelBosco committed on
Commit
f18f27b
·
verified ·
1 Parent(s): 42db10c
Files changed (1) hide show
  1. Dockerfile +26 -17
Dockerfile CHANGED
@@ -1,9 +1,10 @@
1
- # --- 1. Use the official Ollama pre-built image ---
2
- FROM ollama/ollama
3
 
4
- # --- 2. Install Python, pip, and the venv package ---
5
- # The base image is Debian, so we can use apt-get
6
- RUN apt-get update && apt-get install -y python3 python3-pip python3-venv
 
7
 
8
  # --- 3. Download the GGUF model using a safe venv ---
9
  WORKDIR /app
@@ -17,17 +18,25 @@ RUN --mount=type=secret,id=HF_TOKEN \
17
  pip install huggingface_hub && \
18
  python3 download_model.py'
19
 
20
- # --- 4. Create the Ollama "Modelfile" ---
21
- # This file tells Ollama to use our downloaded GGUF
22
- RUN echo "FROM /app/prem-1B-SQL.Q8_0.gguf" > /app/Modelfile
 
 
 
23
 
24
- # --- 5. Import the model into Ollama's registry ---
25
- # We start the server in the background (&), wait 5s for it to boot,
26
- # then run the 'create' command. The 'ollama serve' process
27
- # will automatically end when this RUN step completes.
28
- RUN sh -c 'ollama serve & \
29
- sleep 5 && \
30
- ollama create prem-sql-api -f /app/Modelfile'
31
 
32
- # The base image's default command ("ollama serve") will
33
- # now run and serve the "prem-sql-api" model we just created.
 
 
 
 
 
 
 
 
 
 
1
+ # --- 1. Use a clean Python base image ---
2
+ FROM python:3.12-slim
3
 
4
+ # --- 2. Install build tools & python-venv ---
5
+ # This is needed to compile llama-cpp-python from source
6
+ # and to create a safe venv for downloading
7
+ RUN apt-get update && apt-get install -y build-essential cmake python3-venv
8
 
9
  # --- 3. Download the GGUF model using a safe venv ---
10
  WORKDIR /app
 
18
  pip install huggingface_hub && \
19
  python3 download_model.py'
20
 
21
+ # --- 4. Build llama-cpp-python (THE FAST, CPU-ONLY WAY) ---
22
+ # This is the magic fix:
23
+ # It tells the compiler to NOT build the heavy CUDA/GPU libraries.
24
+ # This will prevent the build from timing out.
25
+ ENV CMAKE_ARGS="-DLLAMA_CUDA=OFF"
26
+ RUN pip install "llama-cpp-python[server]"
27
 
28
+ # --- 5. Run the Server ---
29
+ # Expose port 8000 (which we defined in README.md)
30
+ EXPOSE 8000
 
 
 
 
31
 
32
+ # This is the command that will run when the container starts
33
+ # It reads the API_KEY secret from the environment
34
+ CMD [ \
35
+ "python", \
36
+ "-m", "llama_cpp.server", \
37
+ "--model", "/app/prem-1B-SQL.Q8_0.gguf", \
38
+ "--n_gpu_layers", "0", \
39
+ "--port", "8000", \
40
+ "--host", "0.0.0.0", \
41
+ "--api_key_env_var", "API_KEY" \
42
+ ]