# Base image: the slim variant of the official Python 3.10 image.
# It ships apt-get, which the dependency-install step below relies on.
FROM python:3.10-slim

# All following relative paths (COPY destinations, the CMD script) resolve
# against /app; WORKDIR creates the directory if it does not exist.
WORKDIR /app

# Install the toolchain needed to compile llama-cpp-python against OpenBLAS,
# plus curl, which a later step uses to download the model files.
#   --no-install-recommends  keeps optional "recommended" packages out of the
#                            image, so only what is listed here (and its hard
#                            dependencies) gets installed.
#   ca-certificates          listed explicitly because curl needs it for HTTPS
#                            and it is only a recommended dependency of libcurl.
# The apt package lists are deleted in the same RUN so they never persist in
# an image layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    libopenblas-dev \
    && rm -rf /var/lib/apt/lists/*

# Bring in only the dependency manifest at this point, ahead of the
# application code, so the pip install layer is reused when just the
# code changes.
COPY requirements.txt ./

# Install Python dependencies, compiling llama-cpp-python with OpenBLAS for
# faster CPU inference.
#
# CMAKE_ARGS is declared as a build ARG rather than ENV: it is exported to the
# environment of the RUN below (where pip's CMake-based build picks it up) but
# is not baked into the runtime environment of the final image. It can be
# overridden with `docker build --build-arg CMAKE_ARGS=...`.
#
# Both option spellings are passed on purpose: newer llama.cpp releases use
# GGML_BLAS / GGML_BLAS_VENDOR, older ones use LLAMA_BLAS / LLAMA_BLAS_VENDOR.
# CMake only warns about the pair that the installed version does not know.
# NOTE(review): the pinned llama-cpp-python version lives in requirements.txt,
# which is not visible here; once confirmed, the unused pair can be dropped.
ARG CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
RUN pip install --no-cache-dir -r requirements.txt

# Download models into /app/models - 7B (quality) and 1.5B (speed).
#   -f         make curl exit non-zero on an HTTP error (404, 429, 5xx, ...);
#              without it the error page would be saved as the .gguf file and
#              the build would still succeed with a corrupt model.
#   -L         follow the redirects issued by the download URL.
#   --retry 3  retry transient failures instead of failing the whole build.
# The && chain stops at the first failing command, so a failed download
# aborts the build.
# NOTE(review): the URLs track the mutable `main` revision and the files are
# not checksum-verified; consider pinning a revision and verifying a sha256.
RUN mkdir -p /app/models && \
    echo "Downloading Qwen2.5-Coder-7B (quality model)..." && \
    curl -fL --retry 3 -o /app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
    "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf" && \
    echo "Downloading Qwen2.5-Coder-1.5B (fast model)..." && \
    curl -fL --retry 3 -o /app/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf \
    "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"

# Add the application script and the static directory last: these change most
# often, and placing them after the dependency and model steps means editing
# them does not invalidate those earlier layers.
COPY app.py ./
COPY static/ static/

# Default performance settings exposed to the container as environment
# variables; each can be overridden at `docker run` time with -e.
# (Presumably read by app.py to configure the model runtime - confirm there.)
ENV N_CTX=4096 \
    N_THREADS=4 \
    N_BATCH=512 \
    USE_MLOCK=true \
    USE_MMAP=true

# Declare the port the container is expected to listen on. EXPOSE is
# metadata only; the port still has to be published when the container runs.
EXPOSE 7860/tcp

# Run the application.
# Exec (JSON-array) form: python is started directly rather than through a
# shell, so it runs as PID 1 and receives stop signals itself. app.py is
# resolved relative to the WORKDIR (/app).
CMD ["python", "app.py"]