File size: 5,627 Bytes
95e6f4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec8d28a
95e6f4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b33315
95e6f4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
097c451
076e1b4
097c451
076e1b4
 
ec8d28a
 
 
097c451
 
ec8d28a
 
 
 
 
 
097c451
 
ec8d28a
 
 
 
 
 
 
 
 
 
097c451
ec8d28a
097c451
ec8d28a
 
076e1b4
95e6f4e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env bash
# ============================================================================
#  LUNA 100M — LoRA SFT on RAG/MCP data (GPU instance one-shot script)
# ============================================================================
#  Clones code from HF, downloads the SFT model + dataset, runs LoRA training.
#
#  Usage on a fresh GPU instance (RunPod / Lambda / Vast.ai / etc.):
#    export HF_TOKEN="hf_your_token_here"
#    bash gpu_train.sh
# ============================================================================

set -euo pipefail

# Fail fast with a clear message if the Hugging Face token is missing.
HF_TOKEN="${HF_TOKEN:?Set HF_TOKEN env var}"
export HF_TOKEN   # later python snippets read it via os.environ

# Repo / path constants — readonly so a later step cannot clobber them.
readonly CODE_REPO="ASTERIZER/LUNA-Training"
readonly MODEL_REPO="ASTERIZER/LUNA-100M"
readonly DATASET_REPO="ASTERIZER/LUNA-RAG-MCP-SFT-10M"
readonly WORK_DIR="/workspace/luna"

echo "============================================================"
echo "  LUNA 100M — LoRA SFT (RAG/MCP)  —  GPU Setup"
echo "============================================================"

# ── 1. System deps ──────────────────────────────────────────────
echo "[1/6] Installing system dependencies..."
# Non-interactive apt so the install never blocks on a prompt.
export DEBIAN_FRONTEND=noninteractive
# Quiet the chatter but let real failures reach stderr — the original
# redirected 2>&1 to /dev/null, so under `set -e` an apt failure aborted
# the script with no visible reason.
apt-get update -qq > /dev/null
apt-get install -y -qq git git-lfs python3-pip > /dev/null
# --skip-smudge: register LFS hooks without pulling large blobs on clone.
git lfs install --skip-smudge > /dev/null

# ── 2. Clone code ──────────────────────────────────────────────
echo "[2/6] Cloning training code from $CODE_REPO..."
mkdir -p "$WORK_DIR"
cd "$WORK_DIR"

# Only download when the training scripts are not already present
# (makes re-runs of this script cheap and idempotent).
if [ ! -f "lora_sft_train.py" ] || [ ! -f "upload_lora_to_hf.py" ]; then
    pip install -q huggingface_hub
    # Read the token from the environment instead of interpolating it into
    # the Python source: interpolation leaks the secret into `ps` output and
    # any `set -x` trace, and steps 4/5 below already use os.environ.
    python3 -c "
import os
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id='${CODE_REPO}',
    local_dir='${WORK_DIR}',
    token=os.environ['HF_TOKEN'],
)
print('Code downloaded.')
"
fi

# ── 3. Python deps ─────────────────────────────────────────────
echo "[3/6] Installing Python dependencies..."
# Best effort: the cu121 wheel index may not match every image; tolerated
# because most GPU images already ship a working torch build.
pip install -q torch --index-url https://download.pytorch.org/whl/cu121 2>/dev/null || true
# requirements.txt is mandatory — fail loudly. The original hid pip's stderr
# with 2>/dev/null, which made a missing/broken requirements file very hard
# to diagnose.
if [ ! -f requirements.txt ]; then
    echo "ERROR: requirements.txt not found in $WORK_DIR" >&2
    exit 1
fi
pip install -q -r requirements.txt

# ── 4. Download SFT model checkpoint ──────────────────────────
# Skips the download when the checkpoint is already on disk, so the
# script can be re-run after a partial failure.
echo "[4/6] Downloading SFT base model from $MODEL_REPO..."
python3 -c "
import os
from pathlib import Path
from huggingface_hub import hf_hub_download

local_root = Path('Base/out/input_models/luna_sft_v1')
dest = local_root / 'sft_v1' / 'final' / 'model.pth'
if not dest.exists():
    local_root.mkdir(parents=True, exist_ok=True)
    hf_hub_download(
        repo_id='${MODEL_REPO}',
        filename='sft_v1/final/model.pth',
        local_dir=str(local_root),
        token=os.environ.get('HF_TOKEN'),
    )
    print('Model downloaded.')
else:
    print(f'Checkpoint already exists: {dest}')
"

# ── 5. Download RAG/MCP SFT dataset ───────────────────────────
# Fetches train/val splits individually; files already present are kept.
echo "[5/6] Downloading RAG/MCP dataset from $DATASET_REPO..."
python3 -c "
import os
from pathlib import Path
from huggingface_hub import hf_hub_download

out_dir = Path('Base/Datasets/rag_mcp_sft')
out_dir.mkdir(parents=True, exist_ok=True)

for split_file in ('train.json', 'val.json'):
    dest = out_dir / split_file
    if dest.exists():
        print(f'Already exists: {dest}')
    else:
        hf_hub_download(
            repo_id='${DATASET_REPO}',
            filename=split_file,
            local_dir=str(out_dir),
            repo_type='dataset',
            token=os.environ.get('HF_TOKEN'),
        )
        print(f'Downloaded: {split_file}')
"

# ── 6. Launch LoRA SFT training ───────────────────────────────
echo "[6/6] Starting LoRA SFT training..."
echo "============================================================"

# Show the GPU we are about to use; non-fatal on CPU-only/driverless boxes.
nvidia-smi --query-gpu=name,memory.total --format=csv,noheader || true
echo ""

# Honor a caller-provided CUDA_VISIBLE_DEVICES (useful on multi-GPU nodes);
# defaults to GPU 0 exactly as before.
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}" python3 lora_sft_train.py \
    --config rag_mcp_lora_config.yaml

# Summary banner + optional auto-upload of the trained adapter.
printf '%s\n' "============================================================"
printf '%s\n' "  Training complete!"
printf '%s\n' "  Adapter saved to: Base/out/sft/rag_mcp_lora/final/"
printf '%s\n' "  Full run folder : Base/out/sft/rag_mcp_lora/"
printf '%s\n' "  To upload it to Hugging Face, run:"
printf '%s\n' "  python3 upload_lora_to_hf.py --repo-id ASTERIZER/LUNA-100M --folder Base/out/sft/rag_mcp_lora --path-in-repo rag_mcp_lora"
# Opt in to automatic upload by exporting UPLOAD_TO_HF=1.
if [[ "${UPLOAD_TO_HF:-0}" == "1" ]]; then
    printf '%s\n' "  UPLOAD_TO_HF=1 detected. Uploading adapter to Hugging Face..."
    if [[ -f "upload_lora_to_hf.py" ]]; then
        # Preferred path: the dedicated upload helper from the code repo.
        python3 upload_lora_to_hf.py \
            --repo-id ASTERIZER/LUNA-100M \
            --folder Base/out/sft/rag_mcp_lora \
            --path-in-repo rag_mcp_lora
    else
        # Fallback: inline uploader that checks the expected adapter
        # artifacts exist before pushing the whole run folder.
        python3 -c "
import os
from pathlib import Path
from huggingface_hub import HfApi

run_dir = Path('Base/out/sft/rag_mcp_lora')
expected = [run_dir / 'final' / 'adapter_model.pt', run_dir / 'final' / 'adapter_bundle.pt']
absent = [str(p) for p in expected if not p.exists()]
if absent:
    raise FileNotFoundError('Missing expected adapter files: ' + ', '.join(absent))

hub = HfApi(token=os.environ['HF_TOKEN'])
hub.create_repo(repo_id='ASTERIZER/LUNA-100M', repo_type='model', exist_ok=True)
hub.upload_folder(
    repo_id='ASTERIZER/LUNA-100M',
    repo_type='model',
    folder_path=str(run_dir),
    path_in_repo='rag_mcp_lora',
)
print('uploaded_lora url=https://huggingface.co/ASTERIZER/LUNA-100M/tree/main/rag_mcp_lora')
"
    fi
fi
printf '%s\n' "============================================================"