# syntax=docker/dockerfile:1
# GPU-enabled Dockerfile (works on both GPU and CPU hardware)
# Uses NVIDIA CUDA base image for optimal performance on GPU
# Falls back gracefully to CPU if GPU not available
FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04

WORKDIR /app

# Runtime configuration, grouped in one ENV instruction.
# SECURITY NOTE: HF_TOKEN defaults to empty on purpose. Never bake a real
# token into the image (it would persist in layer history); supply it at
# runtime instead: docker run -e HF_TOKEN=...
ENV MODEL_DIR=/app/pretrain_model \
    HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
    HF_TOKEN="" \
    PYTHONUNBUFFERED=1

# Install Python 3.10 and build tools. build-essential/cmake/pkg-config are
# kept in the final image because llama-cpp-python is compiled at runtime
# (see app/main.py). --no-install-recommends trims the image; the apt lists
# are removed in the same layer so they never persist. Packages sorted
# alphabetically for diffability.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        curl \
        git \
        pkg-config \
        python3-pip \
        python3.10 \
    && rm -rf /var/lib/apt/lists/*

# Set python3.10 as default `python` / `python3`
RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \
    ln -sf /usr/bin/python3.10 /usr/bin/python3

# Install Python dependencies in their own layer (before COPY . .) so that
# source-code changes do not invalidate the dependency cache.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Note: llama-cpp-python will be installed at runtime (see app/main.py)
# This avoids long build times and complex CUDA setup during build

# Model downloads are deferred to first request to speed up build time
# They will be downloaded on first API call via app/models/registry.py
# This makes builds fast while still pre-caching models on subsequent deployments
COPY . .

# Run as a non-root user. /app must be writable by that user because the
# runtime pip install and the deferred model downloads (MODEL_DIR is under
# /app) both write there. Runtime pip installs land in ~/.local, so that
# bin dir is added to PATH. Stable numeric UID for runAsNonRoot checks.
RUN useradd --create-home --uid 10001 app && chown -R app:app /app
USER app
ENV PATH="/home/app/.local/bin:${PATH}"

# FIX: EXPOSE previously advertised 8000 while uvicorn binds 7860 below —
# the documented port and the actual listener must agree. 7860 is the
# Hugging Face Spaces convention, which this image appears to target
# (HF_TOKEN / HF_HUB_* vars above).
EXPOSE 7860

# Cheap liveness probe so orchestrators can detect a wedged container.
# Generous start-period: first request may compile llama-cpp-python and
# download models. NOTE(review): probes "/" — point this at a dedicated
# health endpoint if app.main defines one.
HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
    CMD curl -fsS http://localhost:7860/ || exit 1

# Exec form: uvicorn runs as PID 1 and receives SIGTERM from `docker stop`.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]