Matrix Agent committed on
Commit
1b50d66
·
1 Parent(s): 8910367

v3.2: Speed optimizations - OpenBLAS, dual models (7B/1.5B), model selector, timing display

Browse files
Files changed (3) hide show
  1. Dockerfile +17 -4
  2. app.py +23 -9
  3. static/index.html +29 -2
Dockerfile CHANGED
@@ -2,28 +2,41 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install build dependencies for llama-cpp-python
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
8
  cmake \
9
  curl \
 
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
  # Copy requirements
13
  COPY requirements.txt .
14
 
15
- # Install Python dependencies (llama-cpp-python compiles from source)
 
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
18
- # Download Qwen2.5-Coder-7B-Instruct Q4_K_M GGUF
19
  RUN mkdir -p /app/models && \
 
20
  curl -L -o /app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
21
- "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf"
 
 
 
22
 
23
  # Copy application code and static files
24
  COPY app.py .
25
  COPY static/ ./static/
26
 
 
 
 
 
 
 
 
27
  # Expose port
28
  EXPOSE 7860
29
 
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install build dependencies for llama-cpp-python with OpenBLAS
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
8
  cmake \
9
  curl \
10
+ libopenblas-dev \
11
  && rm -rf /var/lib/apt/lists/*
12
 
13
  # Copy requirements
14
  COPY requirements.txt .
15
 
16
+ # Install Python dependencies with OpenBLAS for faster CPU inference
17
+ ENV CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
18
  RUN pip install --no-cache-dir -r requirements.txt
19
 
20
+ # Download models - 7B (quality) and 1.5B (speed)
21
  RUN mkdir -p /app/models && \
22
+ echo "Downloading Qwen2.5-Coder-7B (quality model)..." && \
23
  curl -L -o /app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
24
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf" && \
25
+ echo "Downloading Qwen2.5-Coder-1.5B (fast model)..." && \
26
+ curl -L -o /app/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf \
27
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"
28
 
29
  # Copy application code and static files
30
  COPY app.py .
31
  COPY static/ ./static/
32
 
33
+ # Performance environment variables
34
+ ENV N_CTX=4096
35
+ ENV N_THREADS=4
36
+ ENV N_BATCH=512
37
+ ENV USE_MLOCK=true
38
+ ENV USE_MMAP=true
39
+
40
  # Expose port
41
  EXPOSE 7860
42
 
app.py CHANGED
@@ -63,28 +63,39 @@ logger.info("=" * 60)
63
 
64
  # ============== Configuration ==============
65
  MODELS_DIR = "/app/models"
66
- N_CTX = 8192
67
- N_THREADS = 2
68
- N_BATCH = 128
69
 
70
- # Model configurations
 
 
 
 
 
 
 
 
71
  MODEL_CONFIGS = {
72
  "qwen2.5-coder-7b": {
73
  "path": f"{MODELS_DIR}/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
74
  "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
75
  "size": "7B",
76
  "quantization": "Q4_K_M",
77
- "default": True
 
 
78
  },
79
  "qwen2.5-coder-1.5b": {
80
- "path": f"{MODELS_DIR}/qwen2.5-coder-1.5b-instruct-q8_0.gguf",
81
- "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q8_0.gguf",
82
  "size": "1.5B",
83
- "quantization": "Q8_0",
84
- "default": False
 
 
85
  }
86
  }
87
 
 
 
88
  # ============== Feature 1: Request Queue ==============
89
  @dataclass
90
  class QueuedRequest:
@@ -256,6 +267,9 @@ class ModelManager:
256
  n_ctx=N_CTX,
257
  n_threads=N_THREADS,
258
  n_batch=N_BATCH,
 
 
 
259
  verbose=False
260
  )
261
 
 
63
 
64
  # ============== Configuration ==============
65
  MODELS_DIR = "/app/models"
 
 
 
66
 
67
+ # Performance tuning - optimized for speed
68
+ N_CTX = int(os.environ.get("N_CTX", 4096)) # Reduced for faster processing
69
+ N_THREADS = int(os.environ.get("N_THREADS", 4)) # More threads for parallelism
70
+ N_BATCH = int(os.environ.get("N_BATCH", 512)) # Larger batch for faster prompt processing
71
+ N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", 0)) # GPU acceleration if available
72
+ USE_MLOCK = os.environ.get("USE_MLOCK", "true").lower() == "true" # Lock model in RAM
73
+ USE_MMAP = os.environ.get("USE_MMAP", "true").lower() == "true" # Memory-mapped loading
74
+
75
+ # Model configurations with speed ratings
76
  MODEL_CONFIGS = {
77
  "qwen2.5-coder-7b": {
78
  "path": f"{MODELS_DIR}/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
79
  "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
80
  "size": "7B",
81
  "quantization": "Q4_K_M",
82
+ "default": True,
83
+ "speed": "standard",
84
+ "description": "Best quality, tool use, complex reasoning"
85
  },
86
  "qwen2.5-coder-1.5b": {
87
+ "path": f"{MODELS_DIR}/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
88
+ "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
89
  "size": "1.5B",
90
+ "quantization": "Q4_K_M",
91
+ "default": False,
92
+ "speed": "fast",
93
+ "description": "3x faster, good for simple tasks"
94
  }
95
  }
96
 
97
+ logger.info(f"Performance settings: ctx={N_CTX}, threads={N_THREADS}, batch={N_BATCH}, mlock={USE_MLOCK}")
98
+
99
  # ============== Feature 1: Request Queue ==============
100
  @dataclass
101
  class QueuedRequest:
 
267
  n_ctx=N_CTX,
268
  n_threads=N_THREADS,
269
  n_batch=N_BATCH,
270
+ n_gpu_layers=N_GPU_LAYERS,
271
+ use_mlock=USE_MLOCK,
272
+ use_mmap=USE_MMAP,
273
  verbose=False
274
  )
275
 
static/index.html CHANGED
@@ -55,6 +55,7 @@
55
  <!-- Models Section -->
56
  <div class="card rounded-xl p-6 shadow-lg mb-8">
57
  <h3 class="text-xl font-bold text-gray-800 mb-4">Available Models</h3>
 
58
  <div id="models-list" class="space-y-3">Loading models...</div>
59
  </div>
60
 
@@ -121,6 +122,14 @@ print(message.content[0].text)</code></pre>
121
  <div class="card rounded-xl p-6 shadow-lg mb-8">
122
  <h3 class="text-xl font-bold text-gray-800 mb-4">Try it Now</h3>
123
  <div class="space-y-4">
 
 
 
 
 
 
 
 
124
  <textarea id="prompt-input" class="w-full p-4 border border-gray-200 rounded-lg focus:ring-2 focus:ring-purple-500 focus:border-transparent" rows="3" placeholder="Enter your prompt here...">Hello! Can you write a simple Python function that calculates factorial?</textarea>
125
  <div class="flex gap-4">
126
  <button onclick="sendMessage()" class="bg-purple-600 hover:bg-purple-700 text-white font-semibold py-2 px-6 rounded-lg transition">
@@ -131,6 +140,7 @@ print(message.content[0].text)</code></pre>
131
  </button>
132
  </div>
133
  <div id="response-output" class="bg-gray-50 rounded-lg p-4 min-h-[100px] text-sm font-mono whitespace-pre-wrap hidden"></div>
 
134
  </div>
135
  </div>
136
 
@@ -232,9 +242,13 @@ print(message.content[0].text)</code></pre>
232
 
233
  async function sendMessage() {
234
  const prompt = document.getElementById('prompt-input').value;
 
235
  const output = document.getElementById('response-output');
 
236
  output.classList.remove('hidden');
237
  output.textContent = 'Sending...';
 
 
238
 
239
  try {
240
  const res = await fetch(BASE_URL + '/anthropic/v1/messages', {
@@ -245,13 +259,16 @@ print(message.content[0].text)</code></pre>
245
  'anthropic-version': '2023-06-01'
246
  },
247
  body: JSON.stringify({
248
- model: 'qwen2.5-coder-7b',
249
  max_tokens: 1024,
250
  messages: [{ role: 'user', content: prompt }]
251
  })
252
  });
253
  const data = await res.json();
 
254
  output.textContent = data.content?.[0]?.text || JSON.stringify(data, null, 2);
 
 
255
  } catch (e) {
256
  output.textContent = 'Error: ' + e.message;
257
  }
@@ -259,9 +276,14 @@ print(message.content[0].text)</code></pre>
259
 
260
  async function sendStreamingMessage() {
261
  const prompt = document.getElementById('prompt-input').value;
 
262
  const output = document.getElementById('response-output');
 
263
  output.classList.remove('hidden');
264
  output.textContent = '';
 
 
 
265
 
266
  try {
267
  const res = await fetch(BASE_URL + '/anthropic/v1/messages', {
@@ -272,7 +294,7 @@ print(message.content[0].text)</code></pre>
272
  'anthropic-version': '2023-06-01'
273
  },
274
  body: JSON.stringify({
275
- model: 'qwen2.5-coder-7b',
276
  max_tokens: 1024,
277
  stream: true,
278
  messages: [{ role: 'user', content: prompt }]
@@ -295,11 +317,16 @@ print(message.content[0].text)</code></pre>
295
  const data = JSON.parse(line.slice(6));
296
  if (data.delta?.text) {
297
  output.textContent += data.delta.text;
 
298
  }
299
  } catch {}
300
  }
301
  }
302
  }
 
 
 
 
303
  } catch (e) {
304
  output.textContent = 'Error: ' + e.message;
305
  }
 
55
  <!-- Models Section -->
56
  <div class="card rounded-xl p-6 shadow-lg mb-8">
57
  <h3 class="text-xl font-bold text-gray-800 mb-4">Available Models</h3>
58
+ <p class="text-sm text-gray-500 mb-4">Choose based on your needs: 7B for quality, 1.5B for speed (3x faster)</p>
59
  <div id="models-list" class="space-y-3">Loading models...</div>
60
  </div>
61
 
 
122
  <div class="card rounded-xl p-6 shadow-lg mb-8">
123
  <h3 class="text-xl font-bold text-gray-800 mb-4">Try it Now</h3>
124
  <div class="space-y-4">
125
+ <div class="flex gap-4 items-center">
126
+ <label class="text-sm font-medium text-gray-700">Model:</label>
127
+ <select id="model-select" class="px-4 py-2 border border-gray-200 rounded-lg focus:ring-2 focus:ring-purple-500">
128
+ <option value="qwen2.5-coder-7b">qwen2.5-coder-7b (Quality)</option>
129
+ <option value="qwen2.5-coder-1.5b">qwen2.5-coder-1.5b (3x Faster)</option>
130
+ </select>
131
+ <span id="speed-indicator" class="text-xs text-gray-500"></span>
132
+ </div>
133
  <textarea id="prompt-input" class="w-full p-4 border border-gray-200 rounded-lg focus:ring-2 focus:ring-purple-500 focus:border-transparent" rows="3" placeholder="Enter your prompt here...">Hello! Can you write a simple Python function that calculates factorial?</textarea>
134
  <div class="flex gap-4">
135
  <button onclick="sendMessage()" class="bg-purple-600 hover:bg-purple-700 text-white font-semibold py-2 px-6 rounded-lg transition">
 
140
  </button>
141
  </div>
142
  <div id="response-output" class="bg-gray-50 rounded-lg p-4 min-h-[100px] text-sm font-mono whitespace-pre-wrap hidden"></div>
143
+ <div id="timing-info" class="text-xs text-gray-500 hidden"></div>
144
  </div>
145
  </div>
146
 
 
242
 
243
  async function sendMessage() {
244
  const prompt = document.getElementById('prompt-input').value;
245
+ const model = document.getElementById('model-select').value;
246
  const output = document.getElementById('response-output');
247
+ const timing = document.getElementById('timing-info');
248
  output.classList.remove('hidden');
249
  output.textContent = 'Sending...';
250
+ timing.classList.add('hidden');
251
+ const startTime = Date.now();
252
 
253
  try {
254
  const res = await fetch(BASE_URL + '/anthropic/v1/messages', {
 
259
  'anthropic-version': '2023-06-01'
260
  },
261
  body: JSON.stringify({
262
+ model: model,
263
  max_tokens: 1024,
264
  messages: [{ role: 'user', content: prompt }]
265
  })
266
  });
267
  const data = await res.json();
268
+ const elapsed = ((Date.now() - startTime) / 1000).toFixed(2);
269
  output.textContent = data.content?.[0]?.text || JSON.stringify(data, null, 2);
270
+ timing.textContent = `Response time: ${elapsed}s | Model: ${model} | Tokens: ${data.usage?.output_tokens || 'N/A'}`;
271
+ timing.classList.remove('hidden');
272
  } catch (e) {
273
  output.textContent = 'Error: ' + e.message;
274
  }
 
276
 
277
  async function sendStreamingMessage() {
278
  const prompt = document.getElementById('prompt-input').value;
279
+ const model = document.getElementById('model-select').value;
280
  const output = document.getElementById('response-output');
281
+ const timing = document.getElementById('timing-info');
282
  output.classList.remove('hidden');
283
  output.textContent = '';
284
+ timing.classList.add('hidden');
285
+ const startTime = Date.now();
286
+ let tokenCount = 0;
287
 
288
  try {
289
  const res = await fetch(BASE_URL + '/anthropic/v1/messages', {
 
294
  'anthropic-version': '2023-06-01'
295
  },
296
  body: JSON.stringify({
297
+ model: model,
298
  max_tokens: 1024,
299
  stream: true,
300
  messages: [{ role: 'user', content: prompt }]
 
317
  const data = JSON.parse(line.slice(6));
318
  if (data.delta?.text) {
319
  output.textContent += data.delta.text;
320
+ tokenCount++;
321
  }
322
  } catch {}
323
  }
324
  }
325
  }
326
+ const elapsed = ((Date.now() - startTime) / 1000).toFixed(2);
327
+ const tokensPerSec = (tokenCount / elapsed).toFixed(1);
328
+ timing.textContent = `Time: ${elapsed}s | Model: ${model} | ~${tokenCount} tokens | ${tokensPerSec} tok/s`;
329
+ timing.classList.remove('hidden');
330
  } catch (e) {
331
  output.textContent = 'Error: ' + e.message;
332
  }