File size: 5,627 Bytes
95e6f4e ec8d28a 95e6f4e 2b33315 95e6f4e 097c451 076e1b4 097c451 076e1b4 ec8d28a 097c451 ec8d28a 097c451 ec8d28a 097c451 ec8d28a 097c451 ec8d28a 076e1b4 95e6f4e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | #!/usr/bin/env bash
# ============================================================================
# LUNA 100M — LoRA SFT on RAG/MCP data (GPU instance one-shot script)
# ============================================================================
# Downloads the training code from HF, downloads the SFT model + dataset,
# and runs LoRA training.
#
# Usage on a fresh GPU instance (RunPod / Lambda / Vast.ai / etc.):
#   export HF_TOKEN="hf_your_token_here"
#   bash gpu_train.sh
# ============================================================================
# Strict mode: stop on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# HF_TOKEN is mandatory. It is exported explicitly because the inline Python in
# the download/upload steps reads it from os.environ, not from a shell variable;
# without the export a token set as a plain shell variable would be invisible
# to those child processes.
: "${HF_TOKEN:?Set HF_TOKEN env var}"
export HF_TOKEN

# Hugging Face repositories used by this script and the working directory
# everything is downloaded into.
CODE_REPO="ASTERIZER/LUNA-Training"
MODEL_REPO="ASTERIZER/LUNA-100M"
DATASET_REPO="ASTERIZER/LUNA-RAG-MCP-SFT-10M"
WORK_DIR="/workspace/luna"

echo "============================================================"
echo " LUNA 100M — LoRA SFT (RAG/MCP) — GPU Setup"
echo "============================================================"
# ── 1. System deps ──────────────────────────────────────────────
echo "[1/6] Installing system dependencies..."
# Packages are installed only after a successful index refresh. A failed
# refresh is reported instead of being skipped silently (as the left side of an
# `&&` list it would be exempt from `set -e` and go unnoticed).
# stdout is discarded to keep the log short; stderr stays visible so that any
# failure that aborts the script explains itself.
if DEBIAN_FRONTEND=noninteractive apt-get update -qq; then
  # noninteractive: never block on a configuration prompt on an unattended box.
  DEBIAN_FRONTEND=noninteractive apt-get install -y -qq git git-lfs python3-pip > /dev/null
else
  echo "WARNING: apt-get update failed; skipping system package installation." >&2
fi
git lfs install --skip-smudge > /dev/null
# ── 2. Clone code ──────────────────────────────────────────────
echo "[2/6] Cloning training code from $CODE_REPO..."
mkdir -p "$WORK_DIR"
cd "$WORK_DIR"
# Fetch the code snapshot only when one of the two entry-point scripts is
# missing from the working directory.
if [[ ! -f "lora_sft_train.py" || ! -f "upload_lora_to_hf.py" ]]; then
  pip install -q huggingface_hub
  # The token and repo settings are handed over through the environment and the
  # Python source is a quoted here-doc (no shell expansion). This keeps the
  # secret out of the python3 argument list and prevents a quote character in
  # any value from breaking the Python syntax.
  HF_TOKEN="$HF_TOKEN" CODE_REPO="$CODE_REPO" WORK_DIR="$WORK_DIR" python3 - <<'PY'
import os
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id=os.environ['CODE_REPO'],
    local_dir=os.environ['WORK_DIR'],
    token=os.environ['HF_TOKEN'],
)
print('Code downloaded.')
PY
fi
# ── 3. Python deps ─────────────────────────────────────────────
echo "[3/6] Installing Python dependencies..."
# The CUDA 12.1 torch wheel is best-effort: a failure does not stop the script,
# but it is reported rather than silently discarded.
if ! pip install -q torch --index-url https://download.pytorch.org/whl/cu121; then
  echo "WARNING: installing torch from the cu121 index failed; continuing with any torch already present." >&2
fi
# The remaining requirements are mandatory. stderr is left visible so that a
# failing install shows pip's own error message before the script aborts.
if [[ ! -f requirements.txt ]]; then
  echo "ERROR: requirements.txt not found in $PWD" >&2
  exit 1
fi
pip install -q -r requirements.txt
# ── 4. Download SFT model checkpoint ──────────────────────────
echo "[4/6] Downloading SFT base model from $MODEL_REPO..."
# The Python source is a quoted here-doc, so the shell expands nothing inside
# it; the repo id is passed through the environment instead. HF_TOKEN is read
# from the inherited environment.
MODEL_REPO="$MODEL_REPO" python3 - <<'PY'
import os
from pathlib import Path
from huggingface_hub import hf_hub_download

ckpt_dir = Path('Base/out/input_models/luna_sft_v1')
# hf_hub_download keeps the repo-relative path below local_dir, so this is
# where the checkpoint ends up.
target = ckpt_dir / 'sft_v1' / 'final' / 'model.pth'
if target.exists():
    # Skip the download when the checkpoint is already on disk.
    print(f'Checkpoint already exists: {target}')
else:
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    hf_hub_download(
        repo_id=os.environ['MODEL_REPO'],
        filename='sft_v1/final/model.pth',
        local_dir=str(ckpt_dir),
        token=os.environ.get('HF_TOKEN'),
    )
    print('Model downloaded.')
PY
# ── 5. Download RAG/MCP SFT dataset ───────────────────────────
echo "[5/6] Downloading RAG/MCP dataset from $DATASET_REPO..."
# The Python source is a quoted here-doc, so the shell expands nothing inside
# it; the repo id is passed through the environment instead. HF_TOKEN is read
# from the inherited environment.
DATASET_REPO="$DATASET_REPO" python3 - <<'PY'
import os
from pathlib import Path
from huggingface_hub import hf_hub_download

data_dir = Path('Base/Datasets/rag_mcp_sft')
data_dir.mkdir(parents=True, exist_ok=True)
for fname in ['train.json', 'val.json']:
    target = data_dir / fname
    if target.exists():
        # Files already on disk are not downloaded again.
        print(f'Already exists: {target}')
        continue
    hf_hub_download(
        repo_id=os.environ['DATASET_REPO'],
        filename=fname,
        local_dir=str(data_dir),
        repo_type='dataset',
        token=os.environ.get('HF_TOKEN'),
    )
    print(f'Downloaded: {fname}')
PY
# ── 6. Launch LoRA SFT training ───────────────────────────────
printf '%s\n' \
  "[6/6] Starting LoRA SFT training..." \
  "============================================================"
# GPU summary is informational only; a missing or failing nvidia-smi must not
# abort the run.
nvidia-smi --query-gpu=name,memory.total --format=csv,noheader || true
printf '\n'
# Run the trainer with only GPU 0 visible to it.
env CUDA_VISIBLE_DEVICES=0 python3 lora_sft_train.py --config rag_mcp_lora_config.yaml
# Reached only when training exited successfully (set -e aborts otherwise).
cat <<'EOF'
============================================================
 Training complete!
 Adapter saved to: Base/out/sft/rag_mcp_lora/final/
 Full run folder : Base/out/sft/rag_mcp_lora/
 To upload it to Hugging Face, run:
 python3 upload_lora_to_hf.py --repo-id ASTERIZER/LUNA-100M --folder Base/out/sft/rag_mcp_lora --path-in-repo rag_mcp_lora
EOF
# Optional upload: runs only when the caller sets UPLOAD_TO_HF=1.
if [[ "${UPLOAD_TO_HF:-0}" == "1" ]]; then
  echo " UPLOAD_TO_HF=1 detected. Uploading adapter to Hugging Face..."
  if [[ -f "upload_lora_to_hf.py" ]]; then
    # Preferred path: the upload helper found in the working directory.
    python3 upload_lora_to_hf.py \
      --repo-id ASTERIZER/LUNA-100M \
      --folder Base/out/sft/rag_mcp_lora \
      --path-in-repo rag_mcp_lora
  else
    # Fallback when the helper script is absent: upload the run folder directly
    # with huggingface_hub. The here-doc is quoted, so the shell expands
    # nothing inside it; HF_TOKEN is read from the inherited environment.
    python3 - <<'PY'
import os
from pathlib import Path
from huggingface_hub import HfApi

folder = Path('Base/out/sft/rag_mcp_lora')
required = [folder / 'final' / 'adapter_model.pt', folder / 'final' / 'adapter_bundle.pt']
# Refuse to upload a run folder that lacks the expected adapter files.
missing = [str(path) for path in required if not path.exists()]
if missing:
    raise FileNotFoundError('Missing expected adapter files: ' + ', '.join(missing))
api = HfApi(token=os.environ['HF_TOKEN'])
api.create_repo(repo_id='ASTERIZER/LUNA-100M', repo_type='model', exist_ok=True)
api.upload_folder(
    repo_id='ASTERIZER/LUNA-100M',
    repo_type='model',
    folder_path=str(folder),
    path_in_repo='rag_mcp_lora',
)
print('uploaded_lora url=https://huggingface.co/ASTERIZER/LUNA-100M/tree/main/rag_mcp_lora')
PY
  fi
fi
echo "============================================================"
|