Upload gpu_train.sh with huggingface_hub
Browse files- gpu_train.sh +112 -0
gpu_train.sh
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# ============================================================================
|
| 3 |
+
# LUNA 100M — LoRA SFT on RAG/MCP data (GPU instance one-shot script)
|
| 4 |
+
# ============================================================================
|
| 5 |
+
# Clones code from HF, downloads the SFT model + dataset, runs LoRA training.
|
| 6 |
+
#
|
| 7 |
+
# Usage on a fresh GPU instance (RunPod / Lambda / Vast.ai / etc.):
|
| 8 |
+
# export HF_TOKEN="hf_your_token_here"
|
| 9 |
+
# bash gpu_train.sh
|
| 10 |
+
# ============================================================================
|
| 11 |
+
|
| 12 |
+
# Strict mode: stop on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Hugging Face access token. The script aborts here with the given message if
# the variable is unset or empty.
HF_TOKEN="${HF_TOKEN:?Set HF_TOKEN env var}"
# The download steps read the token from os.environ inside Python, so make sure
# it is exported even if this file is sourced from a shell that only set it
# as a plain variable.
export HF_TOKEN

# Hub repositories and the local working directory used by the steps below.
CODE_REPO="ASTERIZER/LUNA-Training"
MODEL_REPO="ASTERIZER/LUNA-100M"
DATASET_REPO="ASTERIZER/LUNA-RAG-MCP-SFT-10M"
WORK_DIR="/workspace/luna"

echo "============================================================"
echo " LUNA 100M — LoRA SFT (RAG/MCP) — GPU Setup"
echo "============================================================"
|
| 24 |
+
|
| 25 |
+
# ── 1. System deps ──────────────────────────────────────────────
# Installs git, git-lfs and pip through apt, then registers the git-lfs hooks.
# stdout is silenced to keep the log short; stderr stays visible so a failure
# explains itself before `set -e` stops the script.
echo "[1/6] Installing system dependencies..."
# Kept out of an `a && b` list: there a failing `a` is ignored by `set -e` and
# `b` is silently skipped. A failed index refresh is tolerated with a warning.
if ! apt-get update -qq > /dev/null; then
  echo "warning: apt-get update failed; installing from existing package lists" >&2
fi
apt-get install -y -qq git git-lfs python3-pip > /dev/null
# --skip-smudge: do not fetch LFS file contents automatically on checkout.
git lfs install --skip-smudge > /dev/null
|
| 29 |
+
|
| 30 |
+
# ── 2. Clone code ───────────────────────────────────────────────
# Downloads a snapshot of $CODE_REPO into $WORK_DIR, unless lora_sft_train.py
# is already present there (so a re-run skips the download).
echo "[2/6] Cloning training code from $CODE_REPO..."
mkdir -p "$WORK_DIR"
cd "$WORK_DIR"

if [[ ! -f "lora_sft_train.py" ]]; then
  pip install -q huggingface_hub
  # The token is read from the environment inside Python instead of being
  # pasted into the program text, which would expose it in the process
  # argument list. The quoted heredoc delimiter stops the shell from expanding
  # anything in the Python source; repo id and target directory are passed as
  # ordinary arguments.
  export HF_TOKEN
  python3 - "$CODE_REPO" "$WORK_DIR" <<'PY'
import os
import sys

from huggingface_hub import snapshot_download

repo_id, local_dir = sys.argv[1:3]
snapshot_download(
    repo_id=repo_id,
    local_dir=local_dir,
    token=os.environ.get('HF_TOKEN'),
)
print('Code downloaded.')
PY
fi
|
| 47 |
+
|
| 48 |
+
# ── 3. Python deps ──────────────────────────────────────────────
# Installs torch from the CUDA 12.1 wheel index, then the packages listed in
# requirements.txt in the current directory.
echo "[3/6] Installing Python dependencies..."
# Best-effort: a failure of the torch install is tolerated, but it is reported
# instead of being swallowed silently.
if ! pip install -q torch --index-url https://download.pytorch.org/whl/cu121 2>/dev/null; then
  echo "warning: torch install from the cu121 index failed; continuing" >&2
fi
# stderr is left visible so pip's error is shown before `set -e` aborts.
pip install -q -r requirements.txt
|
| 52 |
+
|
| 53 |
+
# ── 4. Download SFT model checkpoint ────────────────────────────
# Fetches sft_v1/final/model.pth from $MODEL_REPO into
# Base/out/input_models/luna_sft_v1 (relative to the current directory),
# skipping the download when the checkpoint file already exists.
echo "[4/6] Downloading SFT base model from $MODEL_REPO..."
# Quoted heredoc: the shell expands nothing inside the Python source. The repo
# id is passed as an argument and the token is read from the environment.
python3 - "$MODEL_REPO" <<'PY'
import os
import sys
from pathlib import Path

from huggingface_hub import hf_hub_download

repo_id = sys.argv[1]
ckpt_dir = Path('Base/out/input_models/luna_sft_v1')
# Presumably hf_hub_download keeps the repo-relative path below local_dir, so
# this is where the file is expected to land -- confirm against the library.
target = ckpt_dir / 'sft_v1' / 'final' / 'model.pth'
if target.exists():
    print(f'Checkpoint already exists: {target}')
else:
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    hf_hub_download(
        repo_id=repo_id,
        filename='sft_v1/final/model.pth',
        local_dir=str(ckpt_dir),
        token=os.environ.get('HF_TOKEN'),
    )
    print('Model downloaded.')
PY
|
| 74 |
+
|
| 75 |
+
# ── 5. Download RAG/MCP SFT dataset ─────────────────────────────
# Fetches train.json and val.json from $DATASET_REPO into
# Base/Datasets/rag_mcp_sft (relative to the current directory). A file that
# already exists locally is not downloaded again.
echo "[5/6] Downloading RAG/MCP dataset from $DATASET_REPO..."
# Quoted heredoc: the shell expands nothing inside the Python source. The repo
# id is passed as an argument and the token is read from the environment.
python3 - "$DATASET_REPO" <<'PY'
import os
import sys
from pathlib import Path

from huggingface_hub import hf_hub_download

repo_id = sys.argv[1]
data_dir = Path('Base/Datasets/rag_mcp_sft')
data_dir.mkdir(parents=True, exist_ok=True)

for fname in ['train.json', 'val.json']:
    target = data_dir / fname
    if target.exists():
        print(f'Already exists: {target}')
        continue
    # NOTE(review): no repo_type is passed, so the library default applies --
    # confirm this repo is not a dataset-type repo on the Hub.
    hf_hub_download(
        repo_id=repo_id,
        filename=fname,
        local_dir=str(data_dir),
        token=os.environ.get('HF_TOKEN'),
    )
    print(f'Downloaded: {fname}')
PY
|
| 98 |
+
|
| 99 |
+
# ── 6. Launch LoRA SFT training ─────────────────────────────────
# Prints the GPU in use, runs lora_sft_train.py with the RAG/MCP LoRA config,
# and prints a closing banner once the training command has returned.
rule="============================================================"
printf '%s\n' "[6/6] Starting LoRA SFT training..." "$rule"

# Report GPU name and total memory; a failure of nvidia-smi is ignored.
nvidia-smi --query-gpu=name,memory.total --format=csv,noheader || true
printf '\n'

# Only device 0 is made visible to the training process.
CUDA_VISIBLE_DEVICES=0 python3 lora_sft_train.py --config rag_mcp_lora_config.yaml

printf '%s\n' \
  "$rule" \
  " Training complete!" \
  " Adapter saved to: Base/out/sft/rag_mcp_lora/final/" \
  "$rule"
|