Create scripts/pretrain.sh
Browse files- scripts/pretrain.sh +29 -0
scripts/pretrain.sh
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# ARAVALLI-1 Sovereign Pre-training Pipeline
# Status: CATEGORY 1-SN | Authority: GOEC-Secretariat
#
# Runs the four pipeline stages strictly in order and stops at the first
# failure:
#   1. data/scripts/scraper.py
#   2. data/scripts/cleaner.py
#   3. data/tokenizer_train.py
#   4. src/training/trainer.py --config config/model_config.yaml
#
# All paths are relative, so invoke this script from the directory that
# contains data/, src/ and config/.
#
# Environment:
#   CUDA_VISIBLE_DEVICES  Device list exported for stage 4. Defaults to
#                         0,1,2,3 when unset; a value that is already set
#                         (including the empty string) is left untouched.
#
# Exit status:
#   0 when every stage succeeds, otherwise the exit status of the first
#   stage that failed.

set -euo pipefail

readonly BANNER="----------------------------------------------------------------"

#######################################
# Announce a stage, run its command, and abort the pipeline if it fails.
# Arguments: $1 - progress message printed before the command runs
#            $2.. - command and arguments to execute
# Outputs:   progress message on stdout; failure diagnostic on stderr
# Returns:   0 on success; exits the script with the command's status
#            on failure
#######################################
run_stage() {
  local message=$1
  shift
  echo "$message"

  local status=0
  # Capture the status explicitly so the failing stage can be reported
  # before exiting, instead of relying on a silent `set -e` abort.
  "$@" || status=$?
  if (( status != 0 )); then
    printf 'ERROR: stage failed with exit status %d: %s\n' "$status" "$*" >&2
    exit "$status"
  fi
}

echo "$BANNER"
echo "INITIATING ARAVALLI-1 BIRTH CYCLE: GOEC SOVEREIGN AI"
echo "$BANNER"

# 1. Ingestion Phase
run_stage "[STEP 1/4] Ingesting Global & Indigenous Data..." \
  python3 data/scripts/scraper.py

# 2. Refinement Phase
run_stage "[STEP 2/4] Refining & Hashing Sovereign Corpus..." \
  python3 data/scripts/cleaner.py

# 3. Linguistic Evolution
run_stage "[STEP 3/4] Training Sovereign Tokenizer (Indic-BPE)..." \
  python3 data/tokenizer_train.py

# 4. Neural Forging
# Setting environment for multi-GPU training if available.
# `-` (not `:-`) is deliberate: an explicitly empty value set by the caller
# is preserved rather than replaced by the default device list.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES-0,1,2,3}"
run_stage "[STEP 4/4] Commencing Scratch Pre-training (1.2B Parameters)..." \
  python3 src/training/trainer.py --config config/model_config.yaml

# Reached only when all four stages exited with status 0.
echo "$BANNER"
echo "SOVEREIGN BIRTH CYCLE COMPLETE. MODEL SEALING INITIATED."
echo "$BANNER"
|