ubix committed on
Commit
f00f318
·
verified ·
1 Parent(s): f02f4b1

Update Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +17 -99
Dockerfile CHANGED
@@ -1,106 +1,24 @@
1
- # Dockerfile for Qwen2.5-Coder-7B-Instruct-GGUF on Hugging Face Spaces (Free Tier)
2
- # Optimized for: 2 vCPU, 16GB RAM, CPU-only
3
- # Uses llama-cpp-python for OpenAI-compatible API
4
 
5
- FROM python:3.11-slim-bookworm
6
-
7
- # Install system dependencies
8
  RUN apt-get update && apt-get install -y --no-install-recommends \
9
- build-essential \
10
- cmake \
11
- curl \
12
- libopenblas-dev \
13
  && rm -rf /var/lib/apt/lists/*
14
 
15
- # Set environment variables for build optimization
16
- ENV CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
17
- ENV FORCE_CMAKE=1
18
-
19
- # Install llama-cpp-python with OpenBLAS acceleration (CPU-optimized)
20
- RUN pip install --no-cache-dir llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
21
-
22
- # Install huggingface-hub for model download
23
- RUN pip install --no-cache-dir huggingface-hub gradio
24
-
25
- # Set working directory
26
- WORKDIR /app
27
-
28
- # Download Q4_K_M quantized model (smallest viable for coding tasks)
29
- # Q4_K_M: ~4.7GB, good balance for 16GB RAM limit
30
- RUN python3 -c "from huggingface_hub import hf_hub_download; \
31
- hf_hub_download( \
32
- repo_id='Qwen/Qwen2.5-Coder-7B-Instruct-GGUF', \
33
- filename='qwen2.5-coder-7b-instruct-q4_k_m.gguf', \
34
- local_dir='/app/models', \
35
- local_dir_use_symlinks=False \
36
- )"
37
-
38
- # Create Gradio app.py for Hugging Face Spaces interface
39
- RUN cat > /app/app.py << 'EOF'
40
- import gradio as gr
41
- from llama_cpp import Llama
42
- import os
43
 
44
- # Initialize model with aggressive CPU optimization
45
- llm = Llama(
46
- model_path="/app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
47
- n_ctx=4096, # Reduced context for memory efficiency
48
- n_batch=512, # Smaller batch size for 2 vCPU
49
- n_threads=2, # Match Hugging Face free tier vCPUs
50
- n_threads_batch=2,
51
- verbose=False,
52
- use_mmap=True, # Memory mapping for efficient loading
53
- use_mlock=False # Disable memory locking for container compatibility
54
- )
55
-
56
- SYSTEM_PROMPT = "You are Qwen2.5-Coder, a helpful AI coding assistant. Provide concise, accurate code solutions."
57
-
58
- def generate_code(message, history):
59
- # Format with ChatML template
60
- messages = [{"role": "system", "content": SYSTEM_PROMPT}]
61
-
62
- # Add history
63
- for human, assistant in history:
64
- messages.append({"role": "user", "content": human})
65
- messages.append({"role": "assistant", "content": assistant})
66
-
67
- messages.append({"role": "user", "content": message})
68
-
69
- # Generate with conservative parameters for free tier
70
- output = llm.create_chat_completion(
71
- messages=messages,
72
- max_tokens=1024, # Limit response length
73
- temperature=0.7,
74
- top_p=0.9,
75
- repeat_penalty=1.1,
76
- stop=["<|im_end|>", "<|im_start|>"]
77
- )
78
-
79
- return output["choices"][0]["message"]["content"]
80
-
81
- # Create Gradio interface
82
- demo = gr.ChatInterface(
83
- fn=generate_code,
84
- title="Qwen2.5-Coder-7B",
85
- description="Optimized for Hugging Face Spaces Free Tier (2 vCPU, 16GB RAM)",
86
- examples=[
87
- "Write a Python function to reverse a string",
88
- "Explain how quicksort works with code",
89
- "Create a simple REST API with Flask"
90
- ],
91
- cache_examples=False # Disable to save memory
92
- )
93
-
94
- if __name__ == "__main__":
95
- demo.launch(server_name="0.0.0.0", server_port=7860)
96
- EOF
97
-
98
- # Expose Hugging Face Spaces port
99
  EXPOSE 7860
100
 
101
- # Health check
102
- HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
103
- CMD curl -f http://localhost:7860/ || exit 1
104
-
105
- # Run the Gradio app
106
- CMD ["python3", "/app/app.py"]
 
 
 
 
1
+ FROM ghcr.io/ggml-org/llama.cpp:full
 
 
2
 
3
+ # Install wget and other dependencies
 
 
4
  RUN apt-get update && apt-get install -y --no-install-recommends \
5
+ wget \
 
 
 
6
  && rm -rf /var/lib/apt/lists/*
7
 
8
+ # Download the specific GGUF file (not the HTML page)
9
+ # Using Q4_K_M for optimal balance on free tier (2 vCPU, 16GB RAM)
10
+ RUN wget --no-check-certificate "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf" \
11
+ -O /model.gguf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
+ # Expose Hugging Face Spaces default port
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  EXPOSE 7860
15
 
16
+ # Run llama.cpp server with optimized settings for free tier
17
+ ENTRYPOINT ["/llama-server"]
18
+ CMD ["-m", "/model.gguf", \
19
+ "--port", "7860", \
20
+ "--host", "0.0.0.0", \
21
+ "-c", "4096", \
22
+ "-n", "512", \
23
+ "--threads", "2", \
24
+ "--threads-batch", "2"]