ubix commited on
Commit
f02f4b1
·
verified ·
1 Parent(s): f933057

Update Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +103 -4
Dockerfile CHANGED
@@ -1,7 +1,106 @@
1
- FROM ghcr.io/ggml-org/llama.cpp:full
 
 
2
 
3
- RUN apt update && apt install wget -y
4
 
5
- RUN wget "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF" -O /Qwen2.5-Coder-7B.gguf
 
 
 
 
 
 
6
 
7
- CMD ["--server", "-m", "/Qwen2.5-Coder-7B.gguf", "--port", "7860", "--host", "0.0.0.0", "-n", "4096"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# syntax=docker/dockerfile:1
# Dockerfile for Qwen2.5-Coder-7B-Instruct-GGUF on Hugging Face Spaces (Free Tier)
# Optimized for: 2 vCPU, 16GB RAM, CPU-only
# llama-cpp-python (OpenBLAS source build) behind a Gradio chat UI

FROM python:3.11-slim-bookworm

# Build deps for compiling llama-cpp-python with OpenBLAS; curl is kept for HEALTHCHECK
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        curl \
        libopenblas-dev \
    && rm -rf /var/lib/apt/lists/*

# Build llama-cpp-python from source with OpenBLAS acceleration (CPU-optimized).
# NOTE: llama.cpp renamed its CMake options LLAMA_BLAS -> GGML_BLAS; the GGML_*
# names are the ones honored by the 0.3.x series (the old names are ignored).
# CMAKE_ARGS/FORCE_CMAKE are set inline so build-only config doesn't leak into
# the runtime environment, and no prebuilt-wheel index is used so the OpenBLAS
# build actually happens.
RUN CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" FORCE_CMAKE=1 \
    pip install --no-cache-dir llama-cpp-python==0.3.2

# Pinned UI / download deps for reproducible builds.
# gradio is pinned <5 because the app below uses the 4.x tuple-style chat history.
RUN pip install --no-cache-dir "huggingface-hub==0.26.2" "gradio==4.44.1"

WORKDIR /app

# Download the Q4_K_M quantized model at build time.
# Q4_K_M: ~4.7GB, good balance for the 16GB RAM limit.
# local_dir_use_symlinks is deprecated/ignored in recent huggingface_hub and is omitted.
RUN python3 -c "from huggingface_hub import hf_hub_download; \
    hf_hub_download( \
        repo_id='Qwen/Qwen2.5-Coder-7B-Instruct-GGUF', \
        filename='qwen2.5-coder-7b-instruct-q4_k_m.gguf', \
        local_dir='/app/models', \
    )"

# Embed the Gradio app via a COPY heredoc (requires the dockerfile:1 syntax
# directive above; no shell involved, unlike `RUN cat > file << EOF`).
COPY <<'EOF' /app/app.py
import gradio as gr
from llama_cpp import Llama

# Initialize model with conservative CPU settings for the free tier
llm = Llama(
    model_path="/app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
    n_ctx=4096,          # reduced context for memory efficiency
    n_batch=512,         # smaller batch size for 2 vCPU
    n_threads=2,         # match Hugging Face free tier vCPUs
    n_threads_batch=2,
    verbose=False,
    use_mmap=True,       # memory-map the weights for efficient loading
    use_mlock=False,     # disable memory locking for container compatibility
)

SYSTEM_PROMPT = "You are Qwen2.5-Coder, a helpful AI coding assistant. Provide concise, accurate code solutions."

def generate_code(message, history):
    # Build the ChatML-style message list: system prompt + prior turns + new message
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    # Generate with conservative parameters for the free tier
    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=1024,       # limit response length
        temperature=0.7,
        top_p=0.9,
        repeat_penalty=1.1,
        stop=["<|im_end|>", "<|im_start|>"],
    )
    return output["choices"][0]["message"]["content"]

demo = gr.ChatInterface(
    fn=generate_code,
    title="Qwen2.5-Coder-7B",
    description="Optimized for Hugging Face Spaces Free Tier (2 vCPU, 16GB RAM)",
    examples=[
        "Write a Python function to reverse a string",
        "Explain how quicksort works with code",
        "Create a simple REST API with Flask",
    ],
    cache_examples=False,  # disable to save memory
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
EOF

# Run as a non-root user (Hugging Face Spaces convention: UID 1000)
RUN useradd -m -u 1000 user \
    && chown -R user:user /app
USER user
ENV HOME=/home/user

# Hugging Face Spaces routes traffic to this port (EXPOSE is documentation only)
EXPOSE 7860

# Generous start period: loading a ~4.7GB model on 2 vCPUs takes a while
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD curl -f http://localhost:7860/ || exit 1

# Run the Gradio app
CMD ["python3", "/app/app.py"]