Dmitry Beresnev committed
Commit dde400a · 1 Parent(s): 8837f11

fix dockerfile and app module

Files changed (2)
  1. Dockerfile +22 -14
  2. app.py +167 -47
Dockerfile CHANGED
@@ -9,16 +9,17 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libcurl4-openssl-dev \
     && rm -rf /var/lib/apt/lists/*
 
-# Clone and build llama.cpp
+# Clone and build llama.cpp with optimizations for speed
 WORKDIR /build
 # Cache bust to force fresh build
-ARG CACHEBUST=1
+ARG CACHEBUST=2
 RUN git clone https://github.com/ggerganov/llama.cpp.git && \
     cd llama.cpp && \
     cmake -B build -DCMAKE_BUILD_TYPE=Release \
-        -DGGML_NATIVE=OFF \
-        -DGGML_AVX2=OFF \
-        -DGGML_OPENMP=OFF && \
+        -DGGML_NATIVE=ON \
+        -DGGML_AVX2=ON \
+        -DGGML_FMA=ON \
+        -DGGML_F16C=ON && \
     cmake --build build --config Release --target llama-server -j$(nproc) && \
     echo "=== Binary dependencies ===" && \
     ldd build/bin/llama-server || true
@@ -41,26 +42,33 @@ COPY --from=builder /build/llama.cpp/build/bin/*.so.* /usr/local/lib/
 # Update library cache
 RUN ldconfig
 
+# Install Python and FastAPI dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python packages
+RUN pip3 install --no-cache-dir fastapi uvicorn requests pydantic --break-system-packages
+
 # Create non-root user
 RUN useradd -m -u 1000 user && \
     mkdir -p /home/user/.cache/llama.cpp && \
     chown -R user:user /home/user
 
+# Copy application code
+COPY --chown=user:user app.py /home/user/app.py
+
 USER user
 WORKDIR /home/user
 
 # Set environment variables
 ENV HOME=/home/user \
     LLAMA_CACHE=/home/user/.cache/llama.cpp \
-    PATH=/home/user/.local/bin:$PATH
+    PATH=/home/user/.local/bin:$PATH \
+    PYTHONUNBUFFERED=1
 
 EXPOSE 7860
 
-# Start llama-server with HuggingFace model
-# Using DeepSeek LLM 7B Chat - general purpose model
-CMD ["llama-server", \
-    "-hf", "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf", \
-    "--host", "0.0.0.0", \
-    "--port", "7860", \
-    "-c", "2048", \
-    "--metrics"]
+# Start FastAPI app (which manages llama-server internally)
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
app.py CHANGED
@@ -1,59 +1,179 @@
-from fastapi import FastAPI
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import subprocess
+import signal
 import os
-
-# GGUF model configuration
-REPO_ID = "TheBloke/deepseek-coder-6.7B-instruct-GGUF"
-FILENAME = "deepseek-coder-6.7b-instruct.Q4_K_M.gguf"
+import requests
+import time
+from typing import Optional
 
 app = FastAPI()
 
-# Download and cache the GGUF model
-print(f"Downloading {FILENAME} from {REPO_ID}...")
-model_path = hf_hub_download(
-    repo_id=REPO_ID,
-    filename=FILENAME,
-    cache_dir=os.getenv("HF_HOME", "./models")
-)
-print(f"Model downloaded to: {model_path}")
-
-# Load the model with llama-cpp-python
-print("Loading model into memory...")
-llm = Llama(
-    model_path=model_path,
-    n_ctx=2048,  # Context window
-    n_threads=4,  # CPU threads
-    n_gpu_layers=0,  # Use CPU only (set >0 if GPU available)
-    verbose=False
-)
-print("Model loaded successfully!")
+# Predefined list of available models
+AVAILABLE_MODELS = {
+    # === Financial & Summarization Models (Recommended) ===
+    "qwen-2.5-7b": "bartowski/Qwen2.5-7B-Instruct-GGUF:Qwen2.5-7B-Instruct-Q4_K_M.gguf",  # Best for financial + multilingual
+    "kimi-k2-9b": "bartowski/k2-chat-GGUF:k2-chat-Q4_K_M.gguf",  # Kimi K2 - long context, good reasoning
+    "yi-1.5-9b": "bartowski/Yi-1.5-9B-Chat-GGUF:Yi-1.5-9B-Chat-Q4_K_M.gguf",  # Excellent for finance
+    "llama-3.1-8b": "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",  # Great reasoning
+    "mistral-7b": "TheBloke/Mistral-7B-Instruct-v0.3-GGUF:mistral-7b-instruct-v0.3.Q4_K_M.gguf",  # Reliable summarization
 
+    # === Coding Models ===
+    "deepseek-coder": "TheBloke/deepseek-coder-6.7B-instruct-GGUF:deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
 
-@app.post("/v1/chat/completions")
-def chat(req: dict):
-    messages = req.get("messages", [])
-    max_tokens = req.get("max_tokens", 256)
-    temperature = req.get("temperature", 0.7)
-
-    # Use llama-cpp-python's built-in chat completion
-    response = llm.create_chat_completion(
-        messages=messages,
-        max_tokens=max_tokens,
-        temperature=temperature,
-        stop=["</s>", "User:", "###"]
+    # === General Purpose ===
+    "deepseek-chat": "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
+    "llama-3.2-3b": "bartowski/Llama-3.2-3B-Instruct-GGUF:Llama-3.2-3B-Instruct-Q4_K_M.gguf",  # Fast & lightweight
+}
+
+# Global state
+current_model = "deepseek-chat"  # Default model
+llama_process: Optional[subprocess.Popen] = None
+LLAMA_SERVER_PORT = 8080
+LLAMA_SERVER_URL = f"http://localhost:{LLAMA_SERVER_PORT}"
+
+
+class ModelSwitchRequest(BaseModel):
+    model_name: str
+
+
+class ChatCompletionRequest(BaseModel):
+    messages: list[dict]
+    max_tokens: int = 256
+    temperature: float = 0.7
+
+
+def start_llama_server(model_id: str) -> subprocess.Popen:
+    """Start llama-server with specified model (optimized for speed)."""
+    cmd = [
+        "llama-server",
+        "-hf", model_id,
+        "--host", "0.0.0.0",
+        "--port", str(LLAMA_SERVER_PORT),
+        "-c", "2048",  # Context size
+        "-t", "4",  # CPU threads (adjust based on cores)
+        "-ngl", "0",  # GPU layers (0 for CPU-only)
+        "--cont-batching",  # Enable continuous batching for speed
+        "-b", "512",  # Batch size
+    ]
+
+    print(f"Starting llama-server with model: {model_id}")
+    process = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        preexec_fn=os.setsid if os.name != 'nt' else None
     )
 
+    # Wait for server to be ready
+    max_retries = 60
+    for i in range(max_retries):
+        try:
+            response = requests.get(f"{LLAMA_SERVER_URL}/health", timeout=1)
+            if response.status_code == 200:
+                print(f"llama-server ready after {i+1} seconds")
+                return process
+        except:
+            time.sleep(1)
+
+    raise RuntimeError("llama-server failed to start")
+
+
+def stop_llama_server():
+    """Stop the running llama-server."""
+    global llama_process
+    if llama_process:
+        print("Stopping llama-server...")
+        try:
+            if os.name != 'nt':
+                os.killpg(os.getpgid(llama_process.pid), signal.SIGTERM)
+            else:
+                llama_process.terminate()
+            llama_process.wait(timeout=10)
+        except:
+            if os.name != 'nt':
+                os.killpg(os.getpgid(llama_process.pid), signal.SIGKILL)
+            else:
+                llama_process.kill()
+        llama_process = None
+        time.sleep(2)  # Give it time to fully shut down
+
+
+@app.on_event("startup")
+async def startup_event():
+    """Start with default model."""
+    global llama_process
+    model_id = AVAILABLE_MODELS[current_model]
+    llama_process = start_llama_server(model_id)
+
+
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Clean shutdown."""
+    stop_llama_server()
+
+
+@app.get("/")
+async def root():
     return {
-        "choices": [{
-            "message": {
-                "role": "assistant",
-                "content": response["choices"][0]["message"]["content"]
-            }
-        }]
+        "status": "DeepSeek API with dynamic model switching",
+        "current_model": current_model,
+        "available_models": list(AVAILABLE_MODELS.keys())
     }
 
 
-@app.get("/")
-def root():
-    return {"status": "DeepSeek API is online (GGUF)"}
+@app.get("/models")
+async def list_models():
+    """List all available models."""
+    return {
+        "current_model": current_model,
+        "available_models": list(AVAILABLE_MODELS.keys())
+    }
+
+
+@app.post("/switch-model")
+async def switch_model(request: ModelSwitchRequest):
+    """Switch to a different model."""
+    global current_model, llama_process
+
+    if request.model_name not in AVAILABLE_MODELS:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Model '{request.model_name}' not found. Available: {list(AVAILABLE_MODELS.keys())}"
+        )
+
+    if request.model_name == current_model:
+        return {"message": f"Already using model: {current_model}"}
+
+    # Stop current server
+    stop_llama_server()
+
+    # Start with new model
+    model_id = AVAILABLE_MODELS[request.model_name]
+    llama_process = start_llama_server(model_id)
+    current_model = request.model_name
+
+    return {
+        "message": f"Switched to model: {current_model}",
+        "model": current_model
+    }
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: ChatCompletionRequest):
+    """OpenAI-compatible chat completions endpoint."""
+    try:
+        # Forward to llama-server
+        response = requests.post(
+            f"{LLAMA_SERVER_URL}/v1/chat/completions",
+            json={
+                "messages": request.messages,
+                "max_tokens": request.max_tokens,
+                "temperature": request.temperature,
+            },
+            timeout=300
+        )
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
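
For reference, the new endpoints can be exercised from a client roughly as follows. This is a sketch, not part of the commit: the base URL and the chosen model key are assumptions, but the routes and payload fields match the handlers defined above, and the response shape follows llama-server's OpenAI-compatible completion format.

import requests

BASE_URL = "http://localhost:7860"  # assumed; use the Space URL when deployed

# List the predefined models and see which one is currently loaded.
print(requests.get(f"{BASE_URL}/models", timeout=10).json())

# Switch the backing llama-server to another predefined model.
# This blocks while the old process is stopped and the new one warms up.
resp = requests.post(f"{BASE_URL}/switch-model", json={"model_name": "llama-3.2-3b"}, timeout=600)
print(resp.json())

# Send an OpenAI-style chat completion request; app.py forwards it to llama-server.
resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Summarize the Q3 revenue trend in one sentence."}],
        "max_tokens": 128,
        "temperature": 0.3,
    },
    timeout=300,
)
print(resp.json()["choices"][0]["message"]["content"])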