LUNA-Training / setup_and_sft.sh
ASTERIZER's picture
Upload setup_and_sft.sh with huggingface_hub
88d5091 verified
#!/usr/bin/env bash
# =============================================================================
# LUNA 100M — SFT Cloud Setup & Train Entrypoint
# Runs on RunPod, Vast.ai, Lambda Labs, or any Linux GPU pod.
#
# USAGE:
# bash setup_and_sft.sh [huggingface|local] [HF_REPO_OR_PATH]
#
# EXAMPLES:
# # From HuggingFace (recommended):
# bash setup_and_sft.sh huggingface ASTERIZER/LUNA
#
# # Already cloned locally:
# bash setup_and_sft.sh local /workspace/LUNA
# =============================================================================
# Strict mode: abort on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Positional arguments, both optional:
#   $1 - source mode; defaults to "huggingface"
#   $2 - repo id or path, interpreted according to the mode;
#        defaults to "ASTERIZER/LUNA"
DATA_SOURCE="${1:-huggingface}"
REPO_ID="${2:-ASTERIZER/LUNA}"

# Startup banner showing the effective settings.
echo "=========================================="
echo " LUNA 100M — SFT Setup"
echo " Source: $DATA_SOURCE"
echo " Repo : $REPO_ID"
echo "=========================================="
# ── 1. Python packages ────────────────────────────────────────────────────────
echo ""
echo "[1/4] Installing dependencies..."

# Python packages installed for the rest of this script.
luna_pip_packages=(
  torch torchvision
  psutil
  pyyaml
  transformers
  huggingface_hub
  datasets
)

# `python -m pip` installs into the same interpreter that the later
# `python ...` commands in this script use, which a bare `pip` on PATH does
# not guarantee.
# Upgrading pip itself is optional, so a failure here is only reported.
python -m pip install -q --upgrade pip ||
  echo " WARNING: could not upgrade pip; continuing with the installed version." >&2

# The install stays best-effort (a failure does not abort the script), but it
# is no longer silent: pip's stderr is shown and "Done." is printed only when
# the install actually succeeded.
if python -m pip install -q "${luna_pip_packages[@]}"; then
  echo " Done."
else
  echo " WARNING: dependency installation failed; continuing with whatever is already installed." >&2
fi
# ── 2. Clone repo / fetch data ────────────────────────────────────────────────
echo ""
echo "[2/4] Fetching repository..."
WORK_DIR="/workspace/LUNA"

#######################################
# Download a Hugging Face Space with huggingface_hub.snapshot_download.
# Globals:   HF_TOKEN (read, optional) - access token; empty/unset means none
# Arguments: $1 - Space repo id
#            $2 - destination directory
# Returns:   exit status of the python process
# The values travel through the environment instead of being spliced into the
# Python source, so quotes or backslashes in them cannot break (or inject
# code into) the inline program.
#######################################
fetch_space_snapshot() {
  LUNA_FETCH_REPO="$1" LUNA_FETCH_DIR="$2" LUNA_FETCH_TOKEN="${HF_TOKEN:-}" \
    python -c '
import os
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id=os.environ["LUNA_FETCH_REPO"],
    repo_type="space",
    local_dir=os.environ["LUNA_FETCH_DIR"],
    token=os.environ.get("LUNA_FETCH_TOKEN") or None,
)
'
}

case "$DATA_SOURCE" in
  huggingface)
    if [ ! -d "$WORK_DIR" ]; then
      # Try a git clone of the Space first; if that fails, fall back to a
      # snapshot download (which passes HF_TOKEN along when it is set).
      git clone "https://huggingface.co/spaces/$REPO_ID" "$WORK_DIR" ||
        fetch_space_snapshot "$REPO_ID" "$WORK_DIR"
    else
      echo " $WORK_DIR already exists, pulling latest..."
      if [ -e "$WORK_DIR/.git" ]; then
        # A failed pull is not fatal: keep working with the existing checkout.
        git -C "$WORK_DIR" pull ||
          echo " (git pull failed, using existing checkout)" >&2
      else
        echo " (not a git repo, using existing)"
      fi
    fi
    ;;
  local)
    # In local mode the second argument is the project directory itself.
    WORK_DIR="$REPO_ID"
    ;;
  *)
    # Reject unknown modes instead of silently using the default directory.
    echo "ERROR: unknown source '$DATA_SOURCE' (expected 'huggingface' or 'local')" >&2
    exit 2
    ;;
esac

if [ ! -d "$WORK_DIR" ]; then
  echo "ERROR: working directory '$WORK_DIR' does not exist" >&2
  exit 1
fi
cd "$WORK_DIR"
echo " Working dir: $(pwd)"
# ── 3. System probe ──────────────────────────────────────────────────────────
echo ""
echo "[3/4] System probe..."
# Report GPU (name and total memory of CUDA device 0, when CUDA is available),
# total RAM and CPU count. The quoted heredoc delimiter keeps the shell from
# expanding anything inside the Python program, so no quote escaping is needed.
python - <<'PY'
import os
import psutil
import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f' GPU : {props.name} ({props.total_memory/1024**3:.1f} GB)')
else:
    print(' GPU: None')
print(f' RAM : {psutil.virtual_memory().total/1024**3:.1f} GB')
print(f' CPUs : {os.cpu_count()}')
PY

# Verify files exist (paths are relative to the current directory).
echo ""
echo " Checking required files..."
luna_fatal_missing=0
for f in sft_train.py sft_config.yaml Base/Datasets/sft_clean/train.json; do
  if [ -f "$f" ]; then
    echo " ✓ $f"
  else
    echo " ✗ $f MISSING!"
    case "$f" in
      # The training step runs `python sft_train.py --config sft_config.yaml`,
      # so it cannot start without these two files.
      sft_train.py | sft_config.yaml) luna_fatal_missing=1 ;;
      # NOTE(review): whether training can proceed without the dataset file
      # depends on sft_train.py / sft_config.yaml, which are not visible
      # here, so its absence is reported but not treated as fatal.
    esac
  fi
done
echo " (Pretrained checkpoint will be auto-downloaded from HuggingFace if not present)"

# Fail fast with a clear message instead of launching a run that cannot start.
if [ "$luna_fatal_missing" -ne 0 ]; then
  echo "ERROR: sft_train.py and/or sft_config.yaml not found in $(pwd); aborting before training." >&2
  exit 1
fi
# ── 4. Train SFT ─────────────────────────────────────────────────────────────
# Step 4 of 4: announce the run (one blank line before and after the heading).
printf '\n%s\n\n' "[4/4] Starting SFT training..."

# Launch the trainer with the YAML config from the current directory.
python sft_train.py --config sft_config.yaml

# Closing banner; the quoted delimiter makes the heredoc text fully literal.
cat <<'BANNER'

==========================================
 SFT complete! Output: Base/out/sft/luna_100m_sft
==========================================
BANNER