Matrix Agent committed
Commit 1b50d66 · 1 parent: 8910367

v3.2: Speed optimizations - OpenBLAS, dual models (7B/1.5B), model selector, timing display

Files changed:
- Dockerfile +17 -4
- app.py +23 -9
- static/index.html +29 -2
Dockerfile
CHANGED

@@ -2,28 +2,41 @@ FROM python:3.10-slim
 
 WORKDIR /app
 
-# Install build dependencies for llama-cpp-python
+# Install build dependencies for llama-cpp-python with OpenBLAS
 RUN apt-get update && apt-get install -y \
     build-essential \
     cmake \
     curl \
+    libopenblas-dev \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy requirements
 COPY requirements.txt .
 
-# Install Python dependencies
+# Install Python dependencies with OpenBLAS for faster CPU inference
+ENV CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Download
+# Download models - 7B (quality) and 1.5B (speed)
 RUN mkdir -p /app/models && \
+    echo "Downloading Qwen2.5-Coder-7B (quality model)..." && \
     curl -L -o /app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
-    "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf"
+    "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf" && \
+    echo "Downloading Qwen2.5-Coder-1.5B (fast model)..." && \
+    curl -L -o /app/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf \
+    "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"
 
 # Copy application code and static files
 COPY app.py .
 COPY static/ ./static/
 
+# Performance environment variables
+ENV N_CTX=4096
+ENV N_THREADS=4
+ENV N_BATCH=512
+ENV USE_MLOCK=true
+ENV USE_MMAP=true
+
 # Expose port
 EXPOSE 7860
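One note on the build flags: CMAKE_ARGS only takes effect when pip compiles llama-cpp-python from source rather than pulling a prebuilt wheel, and the -DLLAMA_BLAS option names match llama.cpp revisions from around this commit (later releases renamed them to GGML_BLAS). A minimal sketch, assuming a llama-cpp-python version whose low-level bindings expose llama_print_system_info, to confirm inside the container that the wheel really was built against OpenBLAS:

# check_blas.py - hedged sketch: verify llama-cpp-python picked up BLAS.
# Assumes the low-level binding llama_cpp.llama_print_system_info exists
# (it wraps llama.cpp's llama_print_system_info and returns bytes).
from llama_cpp import llama_cpp

info = llama_cpp.llama_print_system_info().decode()
print(info)  # e.g. "AVX = 1 | ... | BLAS = 1 | ..."
if "BLAS = 1" not in info:
    raise SystemExit("llama-cpp-python was built without BLAS; rebuild with CMAKE_ARGS set")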
app.py
CHANGED

@@ -63,28 +63,39 @@ logger.info("=" * 60)
 
 # ============== Configuration ==============
 MODELS_DIR = "/app/models"
-N_CTX = 8192
-N_THREADS = 2
-N_BATCH = 128
 
-#
+# Performance tuning - optimized for speed
+N_CTX = int(os.environ.get("N_CTX", 4096))  # Reduced for faster processing
+N_THREADS = int(os.environ.get("N_THREADS", 4))  # More threads for parallelism
+N_BATCH = int(os.environ.get("N_BATCH", 512))  # Larger batch for faster prompt processing
+N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", 0))  # GPU acceleration if available
+USE_MLOCK = os.environ.get("USE_MLOCK", "true").lower() == "true"  # Lock model in RAM
+USE_MMAP = os.environ.get("USE_MMAP", "true").lower() == "true"  # Memory-mapped loading
+
+# Model configurations with speed ratings
 MODEL_CONFIGS = {
     "qwen2.5-coder-7b": {
         "path": f"{MODELS_DIR}/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
         "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
         "size": "7B",
         "quantization": "Q4_K_M",
-        "default": True
+        "default": True,
+        "speed": "standard",
+        "description": "Best quality, tool use, complex reasoning"
     },
     "qwen2.5-coder-1.5b": {
-        "path": f"{MODELS_DIR}/qwen2.5-coder-1.5b-instruct-
-        "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-
+        "path": f"{MODELS_DIR}/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
+        "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
         "size": "1.5B",
-        "quantization": "
-        "default": False
+        "quantization": "Q4_K_M",
+        "default": False,
+        "speed": "fast",
+        "description": "3x faster, good for simple tasks"
     }
 }
 
+logger.info(f"Performance settings: ctx={N_CTX}, threads={N_THREADS}, batch={N_BATCH}, mlock={USE_MLOCK}")
+
 # ============== Feature 1: Request Queue ==============
 @dataclass
 class QueuedRequest:

@@ -256,6 +267,9 @@ class ModelManager:
             n_ctx=N_CTX,
             n_threads=N_THREADS,
             n_batch=N_BATCH,
+            n_gpu_layers=N_GPU_LAYERS,
+            use_mlock=USE_MLOCK,
+            use_mmap=USE_MMAP,
             verbose=False
         )
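Taken together, the configuration block and the ModelManager change amount to the load path sketched below. This uses llama-cpp-python's public Llama constructor (the keyword arguments shown are real constructor parameters; the model path mirrors the Dockerfile download location, and the prompt is only an illustration):

import os
from llama_cpp import Llama

# Same env-var pattern as app.py: Docker ENV values win, code defaults otherwise.
n_ctx = int(os.environ.get("N_CTX", 4096))
n_threads = int(os.environ.get("N_THREADS", 4))
n_batch = int(os.environ.get("N_BATCH", 512))

llm = Llama(
    model_path="/app/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",  # the "fast" model
    n_ctx=n_ctx,          # smaller context means less KV-cache work per token
    n_threads=n_threads,  # worth matching to the physical cores the host provides
    n_batch=n_batch,      # larger batches speed up prompt ingestion
    n_gpu_layers=0,       # CPU-only here; raise if a GPU is available
    use_mlock=True,       # pin weights in RAM to avoid page-outs mid-generation
    use_mmap=True,        # memory-mapped loading keeps startup cheap
    verbose=False,
)
out = llm("Write a Python factorial function.", max_tokens=128)
print(out["choices"][0]["text"])

Halving N_CTX from 8192 to 4096 mainly trades maximum conversation length for a smaller KV cache, while the thread and batch increases target raw tokens per second; whether 4 threads beats 2 depends on how many vCPUs the Space actually gets.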
static/index.html
CHANGED

@@ -55,6 +55,7 @@
 <!-- Models Section -->
 <div class="card rounded-xl p-6 shadow-lg mb-8">
 <h3 class="text-xl font-bold text-gray-800 mb-4">Available Models</h3>
+<p class="text-sm text-gray-500 mb-4">Choose based on your needs: 7B for quality, 1.5B for speed (3x faster)</p>
 <div id="models-list" class="space-y-3">Loading models...</div>
 </div>
 

@@ -121,6 +122,14 @@ print(message.content[0].text)</code></pre>
 <div class="card rounded-xl p-6 shadow-lg mb-8">
 <h3 class="text-xl font-bold text-gray-800 mb-4">Try it Now</h3>
 <div class="space-y-4">
+<div class="flex gap-4 items-center">
+<label class="text-sm font-medium text-gray-700">Model:</label>
+<select id="model-select" class="px-4 py-2 border border-gray-200 rounded-lg focus:ring-2 focus:ring-purple-500">
+<option value="qwen2.5-coder-7b">qwen2.5-coder-7b (Quality)</option>
+<option value="qwen2.5-coder-1.5b">qwen2.5-coder-1.5b (3x Faster)</option>
+</select>
+<span id="speed-indicator" class="text-xs text-gray-500"></span>
+</div>
 <textarea id="prompt-input" class="w-full p-4 border border-gray-200 rounded-lg focus:ring-2 focus:ring-purple-500 focus:border-transparent" rows="3" placeholder="Enter your prompt here...">Hello! Can you write a simple Python function that calculates factorial?</textarea>
 <div class="flex gap-4">
 <button onclick="sendMessage()" class="bg-purple-600 hover:bg-purple-700 text-white font-semibold py-2 px-6 rounded-lg transition">

@@ -131,6 +140,7 @@ print(message.content[0].text)</code></pre>
 </button>
 </div>
 <div id="response-output" class="bg-gray-50 rounded-lg p-4 min-h-[100px] text-sm font-mono whitespace-pre-wrap hidden"></div>
+<div id="timing-info" class="text-xs text-gray-500 hidden"></div>
 </div>
 </div>
 

@@ -232,9 +242,13 @@ print(message.content[0].text)</code></pre>
 
 async function sendMessage() {
 const prompt = document.getElementById('prompt-input').value;
+const model = document.getElementById('model-select').value;
 const output = document.getElementById('response-output');
+const timing = document.getElementById('timing-info');
 output.classList.remove('hidden');
 output.textContent = 'Sending...';
+timing.classList.add('hidden');
+const startTime = Date.now();
 
 try {
 const res = await fetch(BASE_URL + '/anthropic/v1/messages', {

@@ -245,13 +259,16 @@ print(message.content[0].text)</code></pre>
 'anthropic-version': '2023-06-01'
 },
 body: JSON.stringify({
-model:
+model: model,
 max_tokens: 1024,
 messages: [{ role: 'user', content: prompt }]
 })
 });
 const data = await res.json();
+const elapsed = ((Date.now() - startTime) / 1000).toFixed(2);
 output.textContent = data.content?.[0]?.text || JSON.stringify(data, null, 2);
+timing.textContent = `Response time: ${elapsed}s | Model: ${model} | Tokens: ${data.usage?.output_tokens || 'N/A'}`;
+timing.classList.remove('hidden');
 } catch (e) {
 output.textContent = 'Error: ' + e.message;
 }

@@ -259,9 +276,14 @@ print(message.content[0].text)</code></pre>
 
 async function sendStreamingMessage() {
 const prompt = document.getElementById('prompt-input').value;
+const model = document.getElementById('model-select').value;
 const output = document.getElementById('response-output');
+const timing = document.getElementById('timing-info');
 output.classList.remove('hidden');
 output.textContent = '';
+timing.classList.add('hidden');
+const startTime = Date.now();
+let tokenCount = 0;
 
 try {
 const res = await fetch(BASE_URL + '/anthropic/v1/messages', {

@@ -272,7 +294,7 @@ print(message.content[0].text)</code></pre>
 'anthropic-version': '2023-06-01'
 },
 body: JSON.stringify({
-model:
+model: model,
 max_tokens: 1024,
 stream: true,
 messages: [{ role: 'user', content: prompt }]

@@ -295,11 +317,16 @@ print(message.content[0].text)</code></pre>
 const data = JSON.parse(line.slice(6));
 if (data.delta?.text) {
 output.textContent += data.delta.text;
+tokenCount++;
 }
 } catch {}
 }
 }
 }
+const elapsed = ((Date.now() - startTime) / 1000).toFixed(2);
+const tokensPerSec = (tokenCount / elapsed).toFixed(1);
+timing.textContent = `Time: ${elapsed}s | Model: ${model} | ~${tokenCount} tokens | ${tokensPerSec} tok/s`;
+timing.classList.remove('hidden');
 } catch (e) {
 output.textContent = 'Error: ' + e.message;
 }
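The frontend changes simply thread the selected model name and a wall-clock timer through the existing fetch calls. For reference, a hedged Python equivalent of the non-streaming path, assuming the container is reachable on localhost:7860 (the EXPOSEd port) and that the /anthropic/v1/messages route accepts the Anthropic-style payload the page's JavaScript sends:

import json
import time
import urllib.request

BASE_URL = "http://localhost:7860"  # assumption: e.g. docker run -p 7860:7860 <image>

payload = {
    "model": "qwen2.5-coder-1.5b",  # the "3x Faster" option; use "qwen2.5-coder-7b" for quality
    "max_tokens": 1024,
    "messages": [{"role": "user", "content": "Write a simple Python factorial function."}],
}
req = urllib.request.Request(
    BASE_URL + "/anthropic/v1/messages",
    data=json.dumps(payload).encode("utf-8"),
    headers={"content-type": "application/json", "anthropic-version": "2023-06-01"},
)
start = time.time()
with urllib.request.urlopen(req) as resp:
    data = json.load(resp)
elapsed = time.time() - start

print(data["content"][0]["text"])
tokens = data.get("usage", {}).get("output_tokens", "N/A")
print(f"Response time: {elapsed:.2f}s | Model: {payload['model']} | Tokens: {tokens}")

The streaming page derives tok/s by counting SSE delta events, so the number is approximate: a content_block_delta can carry more than one token of text.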