LUNA-Training / gpu_train.sh
ASTERIZER's picture
Upload gpu_train.sh with huggingface_hub
097c451 verified
#!/usr/bin/env bash
# ============================================================================
# LUNA 100M — LoRA SFT on RAG/MCP data (GPU instance one-shot script)
# ============================================================================
# Clones code from HF, downloads the SFT model + dataset, runs LoRA training.
#
# Usage on a fresh GPU instance (RunPod / Lambda / Vast.ai / etc.):
# export HF_TOKEN="hf_your_token_here"
# bash gpu_train.sh
# ============================================================================
# Strict mode: abort on a failing command, on expansion of an unset variable,
# and on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# The Hugging Face token has to be supplied by the caller; stop with a message
# when it is unset or empty.
: "${HF_TOKEN:?Set HF_TOKEN env var}"

# Working directory on the instance and the Hub repositories the script uses.
WORK_DIR='/workspace/luna'
CODE_REPO='ASTERIZER/LUNA-Training'
MODEL_REPO='ASTERIZER/LUNA-100M'
DATASET_REPO='ASTERIZER/LUNA-RAG-MCP-SFT-10M'
#######################################
# Print the start-up banner.
# The title previously contained a mis-encoded em dash; it is emitted here as
# a proper "—".
# Arguments: none
# Outputs:   three lines on stdout (rule, title, rule)
#######################################
print_banner() {
  local rule="============================================================"
  printf '%s\n' \
    "$rule" \
    " LUNA 100M — LoRA SFT (RAG/MCP) — GPU Setup" \
    "$rule"
}
print_banner
# ── 1. System deps ──────────────────────────────────────────────
# Installs git, git-lfs and pip through apt, then initialises git-lfs with
# --skip-smudge.
#
# The previous form `apt-get update && apt-get install ... > /dev/null 2>&1`
# had two problems:
#   * a failing `apt-get update` is not caught by `set -e` (it is not the last
#     command of the && list), so the install was skipped without any notice;
#   * stderr was discarded, so any later abort gave no hint of the cause.
# The control flow is kept (a failed update still skips the install, a failed
# install still aborts), but the skip is now announced and stderr stays visible.
echo "[1/6] Installing system dependencies..."
if apt-get update -qq; then
  apt-get install -y -qq git git-lfs python3-pip > /dev/null
else
  echo "WARNING: apt-get update failed; skipping package install and relying on preinstalled git, git-lfs and pip." >&2
fi
git lfs install --skip-smudge > /dev/null
# ── 2. Clone code ──────────────────────────────────────────────
# Downloads a snapshot of $CODE_REPO into $WORK_DIR unless both training
# scripts are already present there.
#
# The token, repo id and target directory are handed to Python through the
# environment and read with os.environ. Splicing the token into the program
# text put it on the python3 command line (visible in process listings) and
# would break on a quote character. Reading it from the environment also
# matches how the later download steps obtain it.
echo "[2/6] Cloning training code from $CODE_REPO..."
mkdir -p "$WORK_DIR"
cd "$WORK_DIR"
if [[ ! -f "lora_sft_train.py" || ! -f "upload_lora_to_hf.py" ]]; then
  pip install -q huggingface_hub
  # Quoted heredoc delimiter: the shell performs no expansion inside the
  # Python program.
  HF_TOKEN="$HF_TOKEN" CODE_REPO="$CODE_REPO" WORK_DIR="$WORK_DIR" python3 - <<'PY'
import os

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id=os.environ['CODE_REPO'],
    local_dir=os.environ['WORK_DIR'],
    token=os.environ['HF_TOKEN'],
)
print('Code downloaded.')
PY
fi
# ── 3. Python deps ─────────────────────────────────────────────
# Installs torch from the CUDA 12.1 wheel index (best effort) and then the
# packages listed in requirements.txt (mandatory).
#
# stderr is no longer discarded: with `2>/dev/null` a failed requirements
# install aborted the script under `set -e` without showing pip's error, and a
# failed torch install went completely unnoticed.
echo "[3/6] Installing Python dependencies..."
if ! pip install -q torch --index-url https://download.pytorch.org/whl/cu121; then
  # Still best effort, as before: keep going and let the next steps decide.
  echo "WARNING: torch install from the cu121 index failed; continuing with whatever torch is available." >&2
fi
pip install -q -r requirements.txt
# ── 4. Download SFT model checkpoint ──────────────────────────
# Fetches sft_v1/final/model.pth from $MODEL_REPO into
# Base/out/input_models/luna_sft_v1 unless that file is already on disk.
# The token is read from the HF_TOKEN environment variable inside Python.
echo "[4/6] Downloading SFT base model from $MODEL_REPO..."
python3 -c "
import os
from pathlib import Path

from huggingface_hub import hf_hub_download

download_root = Path('Base/out/input_models/luna_sft_v1')
checkpoint = download_root / 'sft_v1' / 'final' / 'model.pth'

if not checkpoint.exists():
    # Create the download root before fetching the checkpoint file.
    download_root.mkdir(parents=True, exist_ok=True)
    hf_hub_download(
        repo_id='${MODEL_REPO}',
        filename='sft_v1/final/model.pth',
        local_dir=str(download_root),
        token=os.environ.get('HF_TOKEN'),
    )
    print('Model downloaded.')
else:
    print(f'Checkpoint already exists: {checkpoint}')
"
# ── 5. Download RAG/MCP SFT dataset ───────────────────────────
# Fetches train.json and val.json from the dataset repo $DATASET_REPO into
# Base/Datasets/rag_mcp_sft, skipping any file that is already present.
echo "[5/6] Downloading RAG/MCP dataset from $DATASET_REPO..."
python3 -c "
import os
from pathlib import Path

from huggingface_hub import hf_hub_download

dataset_root = Path('Base/Datasets/rag_mcp_sft')
dataset_root.mkdir(parents=True, exist_ok=True)

for split_file in ('train.json', 'val.json'):
    local_copy = dataset_root / split_file
    if not local_copy.exists():
        hf_hub_download(
            repo_id='${DATASET_REPO}',
            filename=split_file,
            local_dir=str(dataset_root),
            repo_type='dataset',
            token=os.environ.get('HF_TOKEN'),
        )
        print(f'Downloaded: {split_file}')
    else:
        print(f'Already exists: {local_copy}')
"
# ── 6. Launch LoRA SFT training ───────────────────────────────
# Prints the GPU name and total memory, then runs lora_sft_train.py with
# rag_mcp_lora_config.yaml and CUDA_VISIBLE_DEVICES set to 0.
printf '%s\n' \
  "[6/6] Starting LoRA SFT training..." \
  "============================================================"
# The GPU query is informational only; a failure here must not stop the run.
nvidia-smi --query-gpu=name,memory.total --format=csv,noheader || true
printf '\n'
train_cmd=(python3 lora_sft_train.py --config rag_mcp_lora_config.yaml)
CUDA_VISIBLE_DEVICES=0 "${train_cmd[@]}"
# Completion summary: where the adapter and the run folder are, plus the
# command for uploading the run folder manually.
# Quoted delimiter: the text is printed literally, with no shell expansion.
cat <<'EOF'
============================================================
 Training complete!
 Adapter saved to: Base/out/sft/rag_mcp_lora/final/
 Full run folder : Base/out/sft/rag_mcp_lora/
 To upload it to Hugging Face, run:
 python3 upload_lora_to_hf.py --repo-id ASTERIZER/LUNA-100M --folder Base/out/sft/rag_mcp_lora --path-in-repo rag_mcp_lora
EOF
# Optional upload, enabled with UPLOAD_TO_HF=1. Uses upload_lora_to_hf.py when
# that script is present in the current directory. Otherwise an inline
# huggingface_hub fallback checks that both adapter files exist and uploads
# the whole run folder to ASTERIZER/LUNA-100M under rag_mcp_lora.
if [[ "${UPLOAD_TO_HF:-0}" == "1" ]]; then
  echo " UPLOAD_TO_HF=1 detected. Uploading adapter to Hugging Face..."
  if [[ ! -f "upload_lora_to_hf.py" ]]; then
    python3 -c "
import os
from pathlib import Path

from huggingface_hub import HfApi

REPO_ID = 'ASTERIZER/LUNA-100M'
run_dir = Path('Base/out/sft/rag_mcp_lora')
final_dir = run_dir / 'final'

# Refuse to upload when either expected adapter file is absent.
absent = []
for artifact in ('adapter_model.pt', 'adapter_bundle.pt'):
    candidate = final_dir / artifact
    if not candidate.exists():
        absent.append(str(candidate))
if absent:
    raise FileNotFoundError('Missing expected adapter files: ' + ', '.join(absent))

hub = HfApi(token=os.environ['HF_TOKEN'])
hub.create_repo(repo_id=REPO_ID, repo_type='model', exist_ok=True)
hub.upload_folder(
    repo_id=REPO_ID,
    repo_type='model',
    folder_path=str(run_dir),
    path_in_repo='rag_mcp_lora',
)
print('uploaded_lora url=https://huggingface.co/ASTERIZER/LUNA-100M/tree/main/rag_mcp_lora')
"
  else
    python3 upload_lora_to_hf.py \
      --repo-id ASTERIZER/LUNA-100M \
      --folder Base/out/sft/rag_mcp_lora \
      --path-in-repo rag_mcp_lora
  fi
fi
echo "============================================================"