# syntax=docker/dockerfile:1
# GPU-enabled Dockerfile (works on both GPU and CPU hardware)
# Uses NVIDIA CUDA base image for optimal performance on GPU
# Falls back gracefully to CPU if GPU not available
FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04

WORKDIR /app

# Runtime configuration, grouped in one ENV instruction.
# SECURITY NOTE: HF_TOKEN defaults to empty on purpose. Never bake a real
# token into the image (it would persist in layer history); supply it at
# runtime instead: docker run -e HF_TOKEN=...
ENV MODEL_DIR=/app/pretrain_model \
    HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
    HF_TOKEN="" \
    PYTHONUNBUFFERED=1

# Install Python 3.10 and build tools. build-essential/cmake/pkg-config are
# kept in the final image because llama-cpp-python is compiled at runtime
# (see app/main.py). --no-install-recommends trims the image; the apt lists
# are removed in the same layer so they never persist. Packages sorted
# alphabetically for diffability.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        curl \
        git \
        pkg-config \
        python3-pip \
        python3.10 \
    && rm -rf /var/lib/apt/lists/*

# Set python3.10 as default `python` / `python3`
RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \
    ln -sf /usr/bin/python3.10 /usr/bin/python3

# Install Python dependencies in their own layer (before COPY . .) so that
# source-code changes do not invalidate the dependency cache.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Note: llama-cpp-python will be installed at runtime (see app/main.py)
# This avoids long build times and complex CUDA setup during build

# Model downloads are deferred to first request to speed up build time
# They will be downloaded on first API call via app/models/registry.py
# This makes builds fast while still pre-caching models on subsequent deployments
COPY . .

# Run as a non-root user. /app must be writable by that user because the
# runtime pip install and the deferred model downloads (MODEL_DIR is under
# /app) both write there. Runtime pip installs land in ~/.local, so that
# bin dir is added to PATH. Stable numeric UID for runAsNonRoot checks.
RUN useradd --create-home --uid 10001 app && chown -R app:app /app
USER app
ENV PATH="/home/app/.local/bin:${PATH}"

# FIX: EXPOSE previously advertised 8000 while uvicorn binds 7860 below —
# the documented port and the actual listener must agree. 7860 is the
# Hugging Face Spaces convention, which this image appears to target
# (HF_TOKEN / HF_HUB_* vars above).
EXPOSE 7860

# Cheap liveness probe so orchestrators can detect a wedged container.
# Generous start-period: first request may compile llama-cpp-python and
# download models. NOTE(review): probes "/" — point this at a dedicated
# health endpoint if app.main defines one.
HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
    CMD curl -fsS http://localhost:7860/ || exit 1

# Exec form: uvicorn runs as PID 1 and receives SIGTERM from `docker stop`.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]