ASTERIZER
/

LUNA-Training

Model card Files Files and versions

xet

Community

ASTERIZER committed on Apr 2

Commit

95e6f4e

verified ·

1 Parent(s): 5411740

Upload gpu_train.sh with huggingface_hub

Browse files

Files changed (1) hide show

gpu_train.sh +112 -0

gpu_train.sh ADDED Viewed

	@@ -0,0 +1,112 @@

+#!/usr/bin/env bash
+# ============================================================================
+#  LUNA 100M — LoRA SFT on RAG/MCP data (one-shot script for a GPU instance)
+# ============================================================================
+#  Fetches the training code from the Hugging Face Hub, downloads the SFT
+#  base checkpoint and the dataset, then launches LoRA training.
+#
+#  Run it on a fresh GPU instance (RunPod / Lambda / Vast.ai / etc.):
+#    export HF_TOKEN="hf_your_token_here"
+#    bash gpu_train.sh
+# ============================================================================
+set -euo pipefail
+# Stop right away, with the message below, when HF_TOKEN is unset or empty.
+: "${HF_TOKEN:?Set HF_TOKEN env var}"
+# Working directory on the instance, followed by the Hub repositories the
+# later steps read from.
+WORK_DIR="/workspace/luna"
+CODE_REPO="ASTERIZER/LUNA-Training"
+MODEL_REPO="ASTERIZER/LUNA-100M"
+DATASET_REPO="ASTERIZER/LUNA-RAG-MCP-SFT-10M"
+# Start-up banner.
+printf '%s\n' \
+    '============================================================' \
+    '  LUNA 100M — LoRA SFT (RAG/MCP)  —  GPU Setup' \
+    '============================================================'
+# ── 1. System deps ──────────────────────────────────────────────
+echo "[1/6] Installing system dependencies..."
+# Each command is its own statement: in `a && b`, a failure of `a` does NOT
+# trigger `set -e`, so a failed `apt-get update` would silently skip the
+# install and let the script carry on. Only stdout is silenced; stderr stays
+# visible so an abort always leaves a diagnostic behind.
+apt-get update -qq
+apt-get install -y -qq git git-lfs python3-pip > /dev/null
+git lfs install --skip-smudge > /dev/null
+# ── 2. Clone code ──────────────────────────────────────────────
+echo "[2/6] Cloning training code from $CODE_REPO..."
+mkdir -p "$WORK_DIR"
+cd "$WORK_DIR"
+# The download is skipped when the training entry point is already present.
+if [[ ! -f "lora_sft_train.py" ]]; then
+    pip install -q huggingface_hub
+    # Values are handed to Python through the environment instead of being
+    # spliced into the source text: the token then never shows up in the
+    # process argument list, and quotes in a value cannot break the Python
+    # string literals. The later download steps read HF_TOKEN the same way.
+    CODE_REPO="$CODE_REPO" WORK_DIR="$WORK_DIR" HF_TOKEN="$HF_TOKEN" python3 -c '
+import os
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id=os.environ["CODE_REPO"],
+    local_dir=os.environ["WORK_DIR"],
+    token=os.environ["HF_TOKEN"],
+)
+print("Code downloaded.")
+'
+fi
+# ── 3. Python deps ─────────────────────────────────────────────
+echo "[3/6] Installing Python dependencies..."
+pip install -q torch --index-url https://download.pytorch.org/whl/cu121 2>/dev/null || true
+pip install -q -r requirements.txt 2>/dev/null
+# ── 4. Download SFT model checkpoint ──────────────────────────
+echo "[4/6] Downloading SFT base model from $MODEL_REPO..."
+# Fetches sft_v1/final/model.pth into Base/out/input_models/luna_sft_v1
+# unless the file is already there. The repo id is expanded by the shell;
+# the token is read from the environment inside Python.
+python3 -c "
+import os
+from pathlib import Path
+from huggingface_hub import hf_hub_download
+root = Path('Base/out/input_models/luna_sft_v1')
+rel_path = 'sft_v1/final/model.pth'
+weights = root / rel_path
+if not weights.exists():
+    root.mkdir(parents=True, exist_ok=True)
+    hf_hub_download(
+        repo_id='${MODEL_REPO}',
+        filename=rel_path,
+        local_dir=str(root),
+        token=os.environ.get('HF_TOKEN'),
+    )
+    print('Model downloaded.')
+else:
+    print(f'Checkpoint already exists: {weights}')
+"
+# ── 5. Download RAG/MCP SFT dataset ───────────────────────────
+echo "[5/6] Downloading RAG/MCP dataset from $DATASET_REPO..."
+python3 -c "
+import os
+from pathlib import Path
+from huggingface_hub import hf_hub_download
+data_dir = Path('Base/Datasets/rag_mcp_sft')
+data_dir.mkdir(parents=True, exist_ok=True)
+for fname in ['train.json', 'val.json']:
+    target = data_dir / fname
+    if target.exists():
+        print(f'Already exists: {target}')
+        continue
+    hf_hub_download(
+        repo_id='${DATASET_REPO}',
+        filename=fname,
+        local_dir=str(data_dir),
+        token=os.environ.get('HF_TOKEN'),
+    )
+    print(f'Downloaded: {fname}')
+"
+# ── 6. Launch LoRA SFT training ───────────────────────────────
+printf '%s\n' \
+    '[6/6] Starting LoRA SFT training...' \
+    '============================================================'
+# The GPU summary is informational only: a missing or failing nvidia-smi
+# must not stop the run, hence the `|| true`.
+nvidia-smi --query-gpu=name,memory.total --format=csv,noheader || true
+printf '\n'
+# Only CUDA device 0 is exposed to the training process.
+CUDA_VISIBLE_DEVICES=0 python3 lora_sft_train.py --config rag_mcp_lora_config.yaml
+printf '%s\n' \
+    '============================================================' \
+    '  Training complete!' \
+    '  Adapter saved to: Base/out/sft/rag_mcp_lora/final/' \
+    '============================================================'