Matrix Agent committed on
Commit
1b50d66
·
1 Parent(s): 8910367

v3.2: Speed optimizations - OpenBLAS, dual models (7B/1.5B), model selector, timing display

Browse files
Files changed (3) hide show
  1. Dockerfile +17 -4
  2. app.py +23 -9
  3. static/index.html +29 -2
Dockerfile CHANGED
@@ -2,28 +2,41 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install build dependencies for llama-cpp-python
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
8
  cmake \
9
  curl \
 
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
  # Copy requirements
13
  COPY requirements.txt .
14
 
15
- # Install Python dependencies (llama-cpp-python compiles from source)
 
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
18
- # Download Qwen2.5-Coder-7B-Instruct Q4_K_M GGUF
19
  RUN mkdir -p /app/models && \
 
20
  curl -L -o /app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
21
- "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf"
 
 
 
22
 
23
  # Copy application code and static files
24
  COPY app.py .
25
  COPY static/ ./static/
26
 
 
 
 
 
 
 
 
27
  # Expose port
28
  EXPOSE 7860
29
 
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install build dependencies for llama-cpp-python with OpenBLAS
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
8
  cmake \
9
  curl \
10
+ libopenblas-dev \
11
  && rm -rf /var/lib/apt/lists/*
12
 
13
  # Copy requirements
14
  COPY requirements.txt .
15
 
16
+ # Install Python dependencies with OpenBLAS for faster CPU inference
17
+ ENV CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
18
  RUN pip install --no-cache-dir -r requirements.txt
19
 
20
+ # Download models - 7B (quality) and 1.5B (speed)
21
  RUN mkdir -p /app/models && \
22
+ echo "Downloading Qwen2.5-Coder-7B (quality model)..." && \
23
  curl -L -o /app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
24
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf" && \
25
+ echo "Downloading Qwen2.5-Coder-1.5B (fast model)..." && \
26
+ curl -L -o /app/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf \
27
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"
28
 
29
  # Copy application code and static files
30
  COPY app.py .
31
  COPY static/ ./static/
32
 
33
+ # Performance environment variables
34
+ ENV N_CTX=4096
35
+ ENV N_THREADS=4
36
+ ENV N_BATCH=512
37
+ ENV USE_MLOCK=true
38
+ ENV USE_MMAP=true
39
+
40
  # Expose port
41
  EXPOSE 7860
42
 
app.py CHANGED
@@ -63,28 +63,39 @@ logger.info("=" * 60)
63
 
64
  # ============== Configuration ==============
65
  MODELS_DIR = "/app/models"
66
- N_CTX = 8192
67
- N_THREADS = 2
68
- N_BATCH = 128
69
 
70
- # Model configurations
 
 
 
 
 
 
 
 
71
  MODEL_CONFIGS = {
72
  "qwen2.5-coder-7b": {
73
  "path": f"{MODELS_DIR}/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
74
  "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
75
  "size": "7B",
76
  "quantization": "Q4_K_M",
77
- "default": True
 
 
78
  },
79
  "qwen2.5-coder-1.5b": {
80
- "path": f"{MODELS_DIR}/qwen2.5-coder-1.5b-instruct-q8_0.gguf",
81
- "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q8_0.gguf",
82
  "size": "1.5B",
83
- "quantization": "Q8_0",
84
- "default": False
 
 
85
  }
86
  }
87
 
 
 
88
  # ============== Feature 1: Request Queue ==============
89
  @dataclass
90
  class QueuedRequest:
@@ -256,6 +267,9 @@ class ModelManager:
256
  n_ctx=N_CTX,
257
  n_threads=N_THREADS,
258
  n_batch=N_BATCH,
 
 
 
259
  verbose=False
260
  )
261
 
 
63
 
64
  # ============== Configuration ==============
65
  MODELS_DIR = "/app/models"
 
 
 
66
 
67
+ # Performance tuning - optimized for speed
68
+ N_CTX = int(os.environ.get("N_CTX", 4096)) # Reduced for faster processing
69
+ N_THREADS = int(os.environ.get("N_THREADS", 4)) # More threads for parallelism
70
+ N_BATCH = int(os.environ.get("N_BATCH", 512)) # Larger batch for faster prompt processing
71
+ N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", 0)) # GPU acceleration if available
72
+ USE_MLOCK = os.environ.get("USE_MLOCK", "true").lower() == "true" # Lock model in RAM
73
+ USE_MMAP = os.environ.get("USE_MMAP", "true").lower() == "true" # Memory-mapped loading
74
+
75
+ # Model configurations with speed ratings
76
  MODEL_CONFIGS = {
77
  "qwen2.5-coder-7b": {
78
  "path": f"{MODELS_DIR}/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
79
  "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
80
  "size": "7B",
81
  "quantization": "Q4_K_M",
82
+ "default": True,
83
+ "speed": "standard",
84
+ "description": "Best quality, tool use, complex reasoning"
85
  },
86
  "qwen2.5-coder-1.5b": {
87
+ "path": f"{MODELS_DIR}/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
88
+ "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
89
  "size": "1.5B",
90
+ "quantization": "Q4_K_M",
91
+ "default": False,
92
+ "speed": "fast",
93
+ "description": "3x faster, good for simple tasks"
94
  }
95
  }
96
 
97
+ logger.info(f"Performance settings: ctx={N_CTX}, threads={N_THREADS}, batch={N_BATCH}, mlock={USE_MLOCK}")
98
+
99
  # ============== Feature 1: Request Queue ==============
100
  @dataclass
101
  class QueuedRequest:
 
267
  n_ctx=N_CTX,
268
  n_threads=N_THREADS,
269
  n_batch=N_BATCH,
270
+ n_gpu_layers=N_GPU_LAYERS,
271
+ use_mlock=USE_MLOCK,
272
+ use_mmap=USE_MMAP,
273
  verbose=False
274
  )
275
 
static/index.html CHANGED
@@ -55,6 +55,7 @@
55
  <!-- Models Section -->
56
  <div class="card rounded-xl p-6 shadow-lg mb-8">
57
  <h3 class="text-xl font-bold text-gray-800 mb-4">Available Models</h3>
 
58
  <div id="models-list" class="space-y-3">Loading models...</div>
59
  </div>
60
 
@@ -121,6 +122,14 @@ print(message.content[0].text)</code></pre>
121
  <div class="card rounded-xl p-6 shadow-lg mb-8">
122
  <h3 class="text-xl font-bold text-gray-800 mb-4">Try it Now</h3>
123
  <div class="space-y-4">
 
 
 
 
 
 
 
 
124
  <textarea id="prompt-input" class="w-full p-4 border border-gray-200 rounded-lg focus:ring-2 focus:ring-purple-500 focus:border-transparent" rows="3" placeholder="Enter your prompt here...">Hello! Can you write a simple Python function that calculates factorial?</textarea>
125
  <div class="flex gap-4">
126
  <button onclick="sendMessage()" class="bg-purple-600 hover:bg-purple-700 text-white font-semibold py-2 px-6 rounded-lg transition">
@@ -131,6 +140,7 @@ print(message.content[0].text)</code></pre>
131
  </button>
132
  </div>
133
  <div id="response-output" class="bg-gray-50 rounded-lg p-4 min-h-[100px] text-sm font-mono whitespace-pre-wrap hidden"></div>
 
134
  </div>
135
  </div>
136
 
@@ -232,9 +242,13 @@ print(message.content[0].text)</code></pre>
232
 
233
  async function sendMessage() {
234
  const prompt = document.getElementById('prompt-input').value;
 
235
  const output = document.getElementById('response-output');
 
236
  output.classList.remove('hidden');
237
  output.textContent = 'Sending...';
 
 
238
 
239
  try {
240
  const res = await fetch(BASE_URL + '/anthropic/v1/messages', {
@@ -245,13 +259,16 @@ print(message.content[0].text)</code></pre>
245
  'anthropic-version': '2023-06-01'
246
  },
247
  body: JSON.stringify({
248
- model: 'qwen2.5-coder-7b',
249
  max_tokens: 1024,
250
  messages: [{ role: 'user', content: prompt }]
251
  })
252
  });
253
  const data = await res.json();
 
254
  output.textContent = data.content?.[0]?.text || JSON.stringify(data, null, 2);
 
 
255
  } catch (e) {
256
  output.textContent = 'Error: ' + e.message;
257
  }
@@ -259,9 +276,14 @@ print(message.content[0].text)</code></pre>
259
 
260
  async function sendStreamingMessage() {
261
  const prompt = document.getElementById('prompt-input').value;
 
262
  const output = document.getElementById('response-output');
 
263
  output.classList.remove('hidden');
264
  output.textContent = '';
 
 
 
265
 
266
  try {
267
  const res = await fetch(BASE_URL + '/anthropic/v1/messages', {
@@ -272,7 +294,7 @@ print(message.content[0].text)</code></pre>
272
  'anthropic-version': '2023-06-01'
273
  },
274
  body: JSON.stringify({
275
- model: 'qwen2.5-coder-7b',
276
  max_tokens: 1024,
277
  stream: true,
278
  messages: [{ role: 'user', content: prompt }]
@@ -295,11 +317,16 @@ print(message.content[0].text)</code></pre>
295
  const data = JSON.parse(line.slice(6));
296
  if (data.delta?.text) {
297
  output.textContent += data.delta.text;
 
298
  }
299
  } catch {}
300
  }
301
  }
302
  }
 
 
 
 
303
  } catch (e) {
304
  output.textContent = 'Error: ' + e.message;
305
  }
 
55
  <!-- Models Section -->
56
  <div class="card rounded-xl p-6 shadow-lg mb-8">
57
  <h3 class="text-xl font-bold text-gray-800 mb-4">Available Models</h3>
58
+ <p class="text-sm text-gray-500 mb-4">Choose based on your needs: 7B for quality, 1.5B for speed (3x faster)</p>
59
  <div id="models-list" class="space-y-3">Loading models...</div>
60
  </div>
61
 
 
122
  <div class="card rounded-xl p-6 shadow-lg mb-8">
123
  <h3 class="text-xl font-bold text-gray-800 mb-4">Try it Now</h3>
124
  <div class="space-y-4">
125
+ <div class="flex gap-4 items-center">
126
+ <label class="text-sm font-medium text-gray-700">Model:</label>
127
+ <select id="model-select" class="px-4 py-2 border border-gray-200 rounded-lg focus:ring-2 focus:ring-purple-500">
128
+ <option value="qwen2.5-coder-7b">qwen2.5-coder-7b (Quality)</option>
129
+ <option value="qwen2.5-coder-1.5b">qwen2.5-coder-1.5b (3x Faster)</option>
130
+ </select>
131
+ <span id="speed-indicator" class="text-xs text-gray-500"></span>
132
+ </div>
133
  <textarea id="prompt-input" class="w-full p-4 border border-gray-200 rounded-lg focus:ring-2 focus:ring-purple-500 focus:border-transparent" rows="3" placeholder="Enter your prompt here...">Hello! Can you write a simple Python function that calculates factorial?</textarea>
134
  <div class="flex gap-4">
135
  <button onclick="sendMessage()" class="bg-purple-600 hover:bg-purple-700 text-white font-semibold py-2 px-6 rounded-lg transition">
 
140
  </button>
141
  </div>
142
  <div id="response-output" class="bg-gray-50 rounded-lg p-4 min-h-[100px] text-sm font-mono whitespace-pre-wrap hidden"></div>
143
+ <div id="timing-info" class="text-xs text-gray-500 hidden"></div>
144
  </div>
145
  </div>
146
 
 
242
 
243
  async function sendMessage() {
244
  const prompt = document.getElementById('prompt-input').value;
245
+ const model = document.getElementById('model-select').value;
246
  const output = document.getElementById('response-output');
247
+ const timing = document.getElementById('timing-info');
248
  output.classList.remove('hidden');
249
  output.textContent = 'Sending...';
250
+ timing.classList.add('hidden');
251
+ const startTime = Date.now();
252
 
253
  try {
254
  const res = await fetch(BASE_URL + '/anthropic/v1/messages', {
 
259
  'anthropic-version': '2023-06-01'
260
  },
261
  body: JSON.stringify({
262
+ model: model,
263
  max_tokens: 1024,
264
  messages: [{ role: 'user', content: prompt }]
265
  })
266
  });
267
  const data = await res.json();
268
+ const elapsed = ((Date.now() - startTime) / 1000).toFixed(2);
269
  output.textContent = data.content?.[0]?.text || JSON.stringify(data, null, 2);
270
+ timing.textContent = `Response time: ${elapsed}s | Model: ${model} | Tokens: ${data.usage?.output_tokens || 'N/A'}`;
271
+ timing.classList.remove('hidden');
272
  } catch (e) {
273
  output.textContent = 'Error: ' + e.message;
274
  }
 
276
 
277
  async function sendStreamingMessage() {
278
  const prompt = document.getElementById('prompt-input').value;
279
+ const model = document.getElementById('model-select').value;
280
  const output = document.getElementById('response-output');
281
+ const timing = document.getElementById('timing-info');
282
  output.classList.remove('hidden');
283
  output.textContent = '';
284
+ timing.classList.add('hidden');
285
+ const startTime = Date.now();
286
+ let tokenCount = 0;
287
 
288
  try {
289
  const res = await fetch(BASE_URL + '/anthropic/v1/messages', {
 
294
  'anthropic-version': '2023-06-01'
295
  },
296
  body: JSON.stringify({
297
+ model: model,
298
  max_tokens: 1024,
299
  stream: true,
300
  messages: [{ role: 'user', content: prompt }]
 
317
  const data = JSON.parse(line.slice(6));
318
  if (data.delta?.text) {
319
  output.textContent += data.delta.text;
320
+ tokenCount++;
321
  }
322
  } catch {}
323
  }
324
  }
325
  }
326
+ const elapsed = ((Date.now() - startTime) / 1000).toFixed(2);
327
+ const tokensPerSec = (tokenCount / elapsed).toFixed(1);
328
+ timing.textContent = `Time: ${elapsed}s | Model: ${model} | ~${tokenCount} tokens | ${tokensPerSec} tok/s`;
329
+ timing.classList.remove('hidden');
330
  } catch (e) {
331
  output.textContent = 'Error: ' + e.message;
332
  }