# (Hugging Face Spaces page header captured by the scrape — status: Sleeping)
# GPU-enabled Dockerfile (works on both GPU and CPU hardware)
# Uses NVIDIA CUDA base image for optimal performance on GPU
# Falls back gracefully to CPU if GPU not available
FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04

WORKDIR /app

# Runtime configuration, grouped in one instruction.
# MODEL_DIR: cache directory used by app/models/registry.py for model downloads.
# HF_TOKEN: intentionally empty — supply the real token at run time
#   (`docker run -e HF_TOKEN=...`); never bake a credential into the image,
#   since ENV values persist in every layer and in `docker history`.
ENV MODEL_DIR=/app/pretrain_model \
    HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
    HF_TOKEN="" \
    PYTHONUNBUFFERED=1

# Install Python 3.10 plus the native toolchain (build-essential, cmake,
# pkg-config) needed because llama-cpp-python is compiled at runtime — see
# app/main.py. --no-install-recommends keeps the image slim; the apt lists
# are removed in the same layer so they never persist in the image.
# DEBIAN_FRONTEND is set inline (not via ENV) so it doesn't leak into runtime.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        curl \
        git \
        pkg-config \
        python3-pip \
        python3.10 \
    && rm -rf /var/lib/apt/lists/*

# Set python3.10 as the default `python` / `python3`
RUN ln -sf /usr/bin/python3.10 /usr/bin/python && ln -sf /usr/bin/python3.10 /usr/bin/python3

# Copy only the dependency manifest before the source tree so this layer is
# reused from cache whenever application code changes but requirements don't.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Note: llama-cpp-python will be installed at runtime (see app/main.py)
# This avoids long build times and complex CUDA setup during build
# Model downloads are deferred to first request to speed up build time
# They will be downloaded on first API call via app/models/registry.py
# NOTE(review): image runs as root. A non-root USER would be safer, but the
# runtime pip install / model downloads above need verified write paths first.
COPY . .

# Must match the port uvicorn binds in CMD below (was 8000 — a mismatch
# with the actual listen port; HF Spaces expects 7860).
EXPOSE 7860

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]