glaive-7b-training / Dockerfile
Hajime MATSUMOTO
Fix permission: use /tmp/training as workdir with proper permissions
ce5bcf8
raw
history blame contribute delete
748 Bytes
# Base image: PyTorch 2.2.0 "devel" build with CUDA 12.1 and cuDNN 8.
FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel
# Working directory for the training job. Relative paths in the COPY, RUN and
# CMD instructions below resolve against it; a later step makes it writable.
WORKDIR /tmp/training
# Base OS packages: git and curl, plus ca-certificates so both can verify
# HTTPS peers. --no-install-recommends keeps optional extras out of the image,
# and the apt lists are removed in the same layer so they never persist.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*
# Python dependencies. Only the manifest is copied at this point, so this
# layer stays cached until requirements.txt itself changes.
COPY requirements.txt ./
RUN pip install --no-cache-dir --requirement requirements.txt
# Training scripts. train.py is the default entry point; train_multi_gpu.py is
# presumably the multi-GPU variant (inferred from its name).
COPY train.py train_multi_gpu.py ./
# The Hugging Face token is supplied at run time through the environment
# (e.g. `docker run -e HF_TOKEN=...`); only an empty placeholder is set here,
# so no credential is baked into the image.
# Both cache variables point at the same directory under /tmp.
# NOTE(review): newer transformers releases deprecate TRANSFORMERS_CACHE in
# favour of HF_HOME - confirm against the version pinned in requirements.txt.
ENV HF_TOKEN="" \
    HF_HOME=/tmp/hf_cache \
    TRANSFORMERS_CACHE=/tmp/hf_cache
# Create the cache, checkpoint and output directories and open them up for
# whatever UID the container is started with.
# Directories get mode 1777 (world-writable plus sticky bit, like /tmp) so any
# runtime user can create files in them. Regular files only gain read access
# (and execute where already executable) instead of a blanket 777, so the
# copied training scripts are not world-writable.
RUN mkdir -p /tmp/hf_cache /tmp/training/checkpoints /tmp/training/output && \
    chmod -R a+rX /tmp/hf_cache /tmp/training && \
    find /tmp/hf_cache /tmp/training -type d -exec chmod 1777 {} +
# Default command: single-GPU training (L40S 48GB).
# Exec (JSON-array) form, so python runs directly without a wrapping shell.
# Override the command at run time to launch train_multi_gpu.py instead.
CMD ["python", "train.py"]