# Matrix Agent
# v3.2: Speed optimizations - OpenBLAS, dual models (7B/1.5B), model selector, timing display
# Commit: 1b50d66
FROM python:3.10-slim

WORKDIR /app

# Build dependencies for compiling llama-cpp-python against OpenBLAS.
# --no-install-recommends keeps the layer minimal; the apt list cache is
# removed in the same layer so it never persists in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    curl \
    libopenblas-dev \
    && rm -rf /var/lib/apt/lists/*
# Copy only the dependency manifest first so the pip layer stays cached
# until requirements.txt itself changes.
COPY requirements.txt .

# Build llama-cpp-python with OpenBLAS for faster CPU inference.
# CMAKE_ARGS is scoped to this single RUN (instead of ENV) so the
# build-only flag does not leak into the runtime environment.
RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" \
    pip install --no-cache-dir -r requirements.txt
# Download models - 7B (quality) and 1.5B (speed).
# -f makes curl fail the build on HTTP errors — without it a 404/500
# response body would be saved as the "model" file and the build would
# still succeed. --retry guards against transient network failures.
RUN mkdir -p /app/models && \
    echo "Downloading Qwen2.5-Coder-7B (quality model)..." && \
    curl -fL --retry 3 -o /app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
    "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf" && \
    echo "Downloading Qwen2.5-Coder-1.5B (fast model)..." && \
    curl -fL --retry 3 -o /app/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf \
    "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"
# Copy application code and static files
COPY app.py .
COPY static/ ./static/

# Performance tuning for inference (read from the environment at runtime).
# Grouped into one instruction to avoid five separate image layers.
# NOTE(review): USE_MLOCK may be silently limited by RLIMIT_MEMLOCK for a
# non-root user — confirm llama-cpp only warns (not aborts) in that case.
ENV N_CTX=4096 \
    N_THREADS=4 \
    N_BATCH=512 \
    USE_MLOCK=true \
    USE_MMAP=true

# Drop root for runtime. No recursive chown: the default umask leaves the
# app and model files world-readable, and a chown -R over the multi-GB
# models directory would duplicate it into a new layer.
# NOTE(review): assumes app.py does not write under /app — confirm.
RUN groupadd --system app && useradd --system --gid app --home /app app
USER app

# Documentation only — publish with `docker run -p 7860:7860`
EXPOSE 7860

# Exec form: python is PID 1 and receives SIGTERM from `docker stop`
CMD ["python", "app.py"]