# Image for a Python service (app.py + static assets) that runs CPU inference
# through llama-cpp-python built against OpenBLAS. Two Qwen2.5-Coder GGUF
# models are baked into the image at build time.
FROM python:3.10-slim

WORKDIR /app

# Build dependencies for compiling llama-cpp-python with OpenBLAS.
# Packages are listed explicitly (and sorted) because recommended packages are
# disabled: ca-certificates is needed by curl for the HTTPS model downloads,
# and pkg-config is listed because llama.cpp's CMake BLAS detection is
# expected to use it to locate OpenBLAS (NOTE(review): confirm against the
# llama-cpp-python version pinned in requirements.txt).
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        libopenblas-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Unprivileged runtime user. It owns /app so that the model downloads and the
# application can write there without root.
RUN useradd --create-home --uid 1000 app \
    && chown app:app /app

# Copy only the dependency manifest first so the (slow) compile layer below is
# reused until requirements.txt itself changes.
COPY requirements.txt .

# Install Python dependencies with OpenBLAS for faster CPU inference.
# CMAKE_ARGS is scoped to this command so a build-only setting does not leak
# into the runtime environment. Both the legacy LLAMA_* and the newer GGML_*
# option names are passed so BLAS is enabled whichever llama.cpp revision
# requirements.txt resolves to; CMake only warns about the unused pair.
RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" \
    pip install --no-cache-dir -r requirements.txt

# Everything below runs as the unprivileged user, so the downloaded models are
# owned by it without needing a separate (layer-duplicating) chown.
USER app

# Download models - 7B (quality) and 1.5B (speed).
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# page as the model file; --retry covers transient network failures.
# One layer per model so that changing one URL does not re-download the other.
RUN mkdir -p /app/models && \
    echo "Downloading Qwen2.5-Coder-7B (quality model)..." && \
    curl --fail --location --retry 3 \
        -o /app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
        "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf"

RUN mkdir -p /app/models && \
    echo "Downloading Qwen2.5-Coder-1.5B (fast model)..." && \
    curl --fail --location --retry 3 \
        -o /app/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf \
        "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"

# Copy application code and static files last: they change most often, and
# placing them here keeps the dependency and model layers cached.
COPY --chown=app:app app.py .
COPY --chown=app:app static/ ./static/

# Performance environment variables (presumably read by app.py at startup;
# the consumer is not visible in this file).
ENV N_CTX=4096 \
    N_THREADS=4 \
    N_BATCH=512 \
    USE_MLOCK=true \
    USE_MMAP=true

# Port the application is expected to listen on (documentation only; publish
# it with `docker run -p`).
EXPOSE 7860

# Run the application (exec form, so the Python process receives signals
# directly).
CMD ["python", "app.py"]