ASTERIZER committed on
Commit
95e6f4e
·
verified ·
1 Parent(s): 5411740

Upload gpu_train.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. gpu_train.sh +112 -0
gpu_train.sh ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# ============================================================================
#  LUNA 100M — LoRA SFT on RAG/MCP data (GPU instance one-shot script)
# ============================================================================
#  Downloads the training code, the SFT base checkpoint and the RAG/MCP
#  dataset from the Hugging Face Hub, then runs LoRA training.
#
#  Usage on a fresh GPU instance (RunPod / Lambda / Vast.ai / etc.):
#      export HF_TOKEN="hf_your_token_here"
#      bash gpu_train.sh
#
#  Required environment:
#      HF_TOKEN   Hugging Face access token (the script aborts if it is
#                 unset or empty).
# ============================================================================

set -euo pipefail

# Fail fast when the token is missing. The token is exported so the embedded
# Python snippets read it from the environment; it is never interpolated into
# a command line, where it would be visible in the process list.
: "${HF_TOKEN:?Set HF_TOKEN env var}"
export HF_TOKEN

readonly CODE_REPO="ASTERIZER/LUNA-Training"
readonly MODEL_REPO="ASTERIZER/LUNA-100M"
readonly DATASET_REPO="ASTERIZER/LUNA-RAG-MCP-SFT-10M"
readonly WORK_DIR="/workspace/luna"

echo "============================================================"
echo "  LUNA 100M — LoRA SFT (RAG/MCP) — GPU Setup"
echo "============================================================"

# ── 1. System deps ──────────────────────────────────────────────
echo "[1/6] Installing system dependencies..."
# `apt-get update` is best-effort: a stale or broken third-party package
# source must not prevent installing from the lists that are available.
# It is kept out of an `&&` chain because `set -e` ignores failures on the
# left-hand side of `&&`, which would silently skip the install.
if ! apt-get update -qq; then
  echo "WARNING: apt-get update failed; installing from existing package lists." >&2
fi
# Only stdout is silenced so that a failure still explains itself on stderr.
apt-get install -y -qq git git-lfs python3-pip > /dev/null
git lfs install --skip-smudge > /dev/null

# ── 2. Clone code ──────────────────────────────────────────────
echo "[2/6] Cloning training code from $CODE_REPO..."
mkdir -p "$WORK_DIR"
cd "$WORK_DIR"

# Steps 4 and 5 import huggingface_hub even when the code is already present,
# so it is installed unconditionally (a no-op when already installed).
pip install -q huggingface_hub

if [[ ! -f "lora_sft_train.py" ]]; then
  # Values are handed to Python through the environment and the heredoc is
  # quoted, so nothing is spliced into the Python source text.
  LUNA_CODE_REPO="$CODE_REPO" LUNA_WORK_DIR="$WORK_DIR" python3 - <<'PY'
import os
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id=os.environ['LUNA_CODE_REPO'],
    local_dir=os.environ['LUNA_WORK_DIR'],
    token=os.environ['HF_TOKEN'],
)
print('Code downloaded.')
PY
fi

# ── 3. Python deps ─────────────────────────────────────────────
echo "[3/6] Installing Python dependencies..."
# The CUDA 12.1 torch wheel is a best-effort pre-install: a failure is
# reported but does not stop the script.
if ! pip install -q torch --index-url https://download.pytorch.org/whl/cu121; then
  echo "WARNING: installing torch from the cu121 index failed; continuing." >&2
fi
# Errors are left visible: a failure here aborts the script under `set -e`.
pip install -q -r requirements.txt

# ── 4. Download SFT model checkpoint ──────────────────────────
echo "[4/6] Downloading SFT base model from $MODEL_REPO..."
LUNA_MODEL_REPO="$MODEL_REPO" python3 - <<'PY'
import os
from pathlib import Path
from huggingface_hub import hf_hub_download

ckpt_dir = Path('Base/out/input_models/luna_sft_v1')
target = ckpt_dir / 'sft_v1' / 'final' / 'model.pth'
if target.exists():
    print(f'Checkpoint already exists: {target}')
else:
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    hf_hub_download(
        repo_id=os.environ['LUNA_MODEL_REPO'],
        filename='sft_v1/final/model.pth',
        local_dir=str(ckpt_dir),
        token=os.environ.get('HF_TOKEN'),
    )
    print('Model downloaded.')
PY

# ── 5. Download RAG/MCP SFT dataset ───────────────────────────
echo "[5/6] Downloading RAG/MCP dataset from $DATASET_REPO..."
LUNA_DATASET_REPO="$DATASET_REPO" python3 - <<'PY'
import os
from pathlib import Path
from huggingface_hub import hf_hub_download

data_dir = Path('Base/Datasets/rag_mcp_sft')
data_dir.mkdir(parents=True, exist_ok=True)

for fname in ['train.json', 'val.json']:
    target = data_dir / fname
    if target.exists():
        print(f'Already exists: {target}')
        continue
    hf_hub_download(
        repo_id=os.environ['LUNA_DATASET_REPO'],
        filename=fname,
        local_dir=str(data_dir),
        token=os.environ.get('HF_TOKEN'),
    )
    print(f'Downloaded: {fname}')
PY

# ── 6. Launch LoRA SFT training ───────────────────────────────
echo "[6/6] Starting LoRA SFT training..."
echo "============================================================"

# Informational only: a missing or failing nvidia-smi must not stop the run.
nvidia-smi --query-gpu=name,memory.total --format=csv,noheader || true
echo ""

CUDA_VISIBLE_DEVICES=0 python3 lora_sft_train.py \
  --config rag_mcp_lora_config.yaml

echo "============================================================"
echo "  Training complete!"
echo "  Adapter saved to: Base/out/sft/rag_mcp_lora/final/"
echo "============================================================"