Matrix Agent
v3.2: Speed optimizations - OpenBLAS, dual models (7B/1.5B), model selector, timing display
1b50d66
FROM python:3.10-slim

WORKDIR /app

# Build dependencies for llama-cpp-python compiled against OpenBLAS.
# --no-install-recommends keeps the layer small; the apt cache is removed
# in the same RUN so it never lands in an image layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    curl \
    libopenblas-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so dependency installation is cached
# independently of application-code changes.
COPY requirements.txt .

# CMAKE_ARGS tells llama-cpp-python's build to link OpenBLAS for
# faster CPU inference.
ENV CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
RUN pip install --no-cache-dir -r requirements.txt

# Bake both GGUF models into the image:
#   - 7B   q4_k_m: higher quality, slower
#   - 1.5B q4_k_m: faster, lower quality
RUN mkdir -p /app/models && \
    echo "Downloading Qwen2.5-Coder-7B (quality model)..." && \
    curl -L -o /app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
        "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf" && \
    echo "Downloading Qwen2.5-Coder-1.5B (fast model)..." && \
    curl -L -o /app/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf \
        "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"

# Application code and static assets.
COPY app.py .
COPY static/ ./static/

# Runtime performance defaults read by the app (context size, CPU
# threads, batch size, memory locking/mapping). One ENV instruction
# keeps them in a single layer; all values unchanged.
ENV N_CTX=4096 \
    N_THREADS=4 \
    N_BATCH=512 \
    USE_MLOCK=true \
    USE_MMAP=true

EXPOSE 7860

CMD ["python", "app.py"]