# CPU-only training image (layout follows the Hugging Face Spaces Docker
# convention: non-root user with uid 1000, app under /home/user/app).
# Build context must contain train.py, which is the container entry point.

# Lightweight Python base image.
FROM python:3.10-slim

# 1. System dependencies: compiler toolchain, jemalloc, and git.
#    --no-install-recommends keeps optional packages out of the slim image;
#    the apt lists are removed in the same layer so they are not persisted.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        libjemalloc-dev \
        git \
    && rm -rf /var/lib/apt/lists/*

# Preload jemalloc as the process-wide allocator (intended to reduce memory
# fragmentation during long CPU-only training runs).
# NOTE(review): this is the x86_64 multiarch path and the unversioned .so
# symlink, presumably shipped by libjemalloc-dev; it will not resolve on
# other architectures -- confirm the target platform.
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libjemalloc.so"

# 2. Non-root user (uid 1000). Everything below runs as this user, so pip
#    installs land under /home/user/.local, whose bin dir is put on PATH.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# 3. Python dependencies. Installed before the source COPY so that editing
#    the training script does not invalidate this (slow) layer.
#    intel-extension-for-pytorch (IPEX) is included for CPU speedups on
#    Intel hardware.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir \
        torch \
        intel-extension-for-pytorch \
        transformers \
        datasets \
        accelerate \
        trl \
        sentencepiece

# 4. Copy the training script and the rest of the build context, owned by
#    the non-root user so the script can write next to itself.
COPY --chown=user . $HOME/app

# 5. CPU threading. OpenMP and MKL are capped at 2 threads (sized for a
#    2-vCPU machine; raise both values together on larger instances).
#    NOTE(review): USE_CPU is presumably read by train.py -- confirm.
ENV OMP_NUM_THREADS=2 \
    MKL_NUM_THREADS=2 \
    USE_CPU=1

# 6. Run the training script (exec form, so python is PID 1 and receives
#    signals directly).
CMD ["python", "train.py"]