File size: 3,691 Bytes
5411740
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env bash
# =============================================================================
#  LUNA 100M — Cloud Setup & Train Entrypoint
#  Runs on RunPod, Vast.ai, Lambda Labs, or any Linux GPU pod.
#
#  USAGE (after cloning repo):
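#    bash setup_and_train.sh [gdrive|huggingface|local] [SOURCE_ID_OR_PATH] [MAX_TOKENS]
#    (all arguments are optional; defaults: local, Base/data/litdata_pretrain_final, 4515286950)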
#
#  EXAMPLES:
#    # Full dataset from Google Drive folder:
#    bash setup_and_train.sh gdrive 1AbCdEfGhIjKlMnOpQrStUvWx
#
#    # Full dataset from HuggingFace:
#    bash setup_and_train.sh huggingface ASTERIZER/Luna_Dataset
#
#    # Quick smoke test (10M tokens only):
#    bash setup_and_train.sh huggingface ASTERIZER/Luna_Dataset 10000000
#
#    # Dataset already on disk:
#    bash setup_and_train.sh local /workspace/data/litdata_pretrain_final
# =============================================================================

# Strict mode: abort on any failing command (-e), on expansion of an unset
# variable (-u), and on a failure anywhere inside a pipeline (pipefail).
set -euo pipefail

# Positional arguments (all optional):
#   $1  data source: gdrive | huggingface | local
#   $2  source-specific ID or path, handed to fetch_data.py
#   $3  token budget, handed to train.py as --max_tokens
DATA_SOURCE="${1:-local}"
DATA_ID="${2:-Base/data/litdata_pretrain_final}"
MAX_TOKENS="${3:-4515286950}"

# Fixed locations on the pod: where the dataset is placed and where training writes.
DATA_DIR="/workspace/data/litdata_pretrain_final"
OUT_DIR="/workspace/out/pretrain/luna-100m"

echo "=========================================="
echo "  LUNA 100M — Cloud Setup"
echo "  Source  : $DATA_SOURCE"
echo "  ID/Path : $DATA_ID"
echo "  Tokens  : $MAX_TOKENS"
echo "=========================================="

# Fail fast on a mistyped source so the user does not sit through the
# dependency installation before learning the argument was wrong.
case "$DATA_SOURCE" in
    gdrive|huggingface|local)
        ;;
    *)
        echo "Unknown source: $DATA_SOURCE (use: gdrive | huggingface | local)" >&2
        exit 1
        ;;
esac

# ── 1. Python packages ────────────────────────────────────────────────────────
echo ""
echo "[1/4] Installing dependencies..."

# Upgrade pip itself first; under `set -e` a failure here aborts the script.
pip install -q --upgrade pip

# Best-effort install of the training stack, as in the original: a failure does
# not abort the run. Unlike before, pip's error output is left visible and a
# warning is printed, so a later import error in training can be traced back
# to this step instead of being silently hidden.
if ! pip install -q \
    torch torchvision \
    psutil \
    huggingface_hub \
    gdown \
    tensorboard \
    litgpt; then
    echo "      WARNING: pip reported errors while installing dependencies; continuing with whatever is already installed." >&2
fi

echo "      Done."

# ── 2. Download dataset ───────────────────────────────────────────────────────
echo ""
echo "[2/4] Fetching dataset..."

# Assemble the fetch_data.py arguments for the selected source, then invoke the
# script once. Anything other than the three known sources ends the run.
case "$DATA_SOURCE" in
    gdrive)
        fetch_args=(--source gdrive --gdrive_id "$DATA_ID" --out_dir "$DATA_DIR")
        ;;
    huggingface)
        # The token is taken from the environment; it is the empty string when
        # HF_TOKEN is unset.
        : "${HF_TOKEN:=}"
        fetch_args=(--source huggingface --hf_repo "$DATA_ID" --out_dir "$DATA_DIR" --hf_token "$HF_TOKEN")
        ;;
    local)
        fetch_args=(--source local --local_path "$DATA_ID" --out_dir "$DATA_DIR")
        ;;
    *)
        echo "Unknown source: $DATA_SOURCE (use: gdrive | huggingface | local)"
        exit 1
        ;;
esac

python fetch_data.py "${fetch_args[@]}"

# ── 3. System + batch size probe ──────────────────────────────────────────────
echo ""
echo "[3/4] System probe (auto-detects VRAM, RAM, CPU)..."

# Informational report: CUDA device 0 (name and total memory) when CUDA is
# available, total system RAM, and the CPU count. The quoted heredoc delimiter
# keeps the shell from expanding anything inside the Python source.
python - <<'PY'
import torch, psutil, os

if torch.cuda.is_available():
    gpu = torch.cuda.get_device_properties(0)
    print(f'  GPU  : {gpu.name} ({gpu.total_memory/1024**3:.1f} GB)')
else:
    print('  GPU: None')

ram_gb = psutil.virtual_memory().total/1024**3
print(f'  RAM  : {ram_gb:.1f} GB')
print(f'  CPUs : {os.cpu_count()}')
PY

# ── 4. Train ──────────────────────────────────────────────────────────────────
printf '\n%s\n\n' "[4/4] Starting training (auto_config reads from train_config.yaml)..."

# Arguments handed to train.py: the YAML config plus the dataset location,
# output directory and token budget chosen earlier in this script.
train_args=(
    --config train_config.yaml
    --data_path "$DATA_DIR"
    --out_dir "$OUT_DIR"
    --max_tokens "$MAX_TOKENS"
)
python train.py "${train_args[@]}"

# Closing banner, printed only when train.py exits successfully (the script
# runs with `set -e`).
banner_rule="=========================================="
printf '\n%s\n%s\n%s\n' \
    "$banner_rule" \
    "  Training complete!  Output: $OUT_DIR" \
    "$banner_rule"