Remove launch_optimized.sh - cleanup for OS launch
launch_optimized.sh +0 -74
launch_optimized.sh
DELETED
@@ -1,74 +0,0 @@
-#!/bin/bash
-#
-# BitTransformerLM OPTIMIZED Massive Scale Training Launcher
-# ==========================================================
-#
-# Launches 680M parameter BitTransformerLM with ALL optimizations enabled!
-# Uses DataParallel for reliable multi-GPU training.
-#
-
-set -e  # Exit on any error
-
-echo "BITTRANSFORMERLM OPTIMIZED MASSIVE SCALE TRAINING"
-echo "====================================================="
-echo "Target: 680 MILLION parameters (CONFIRMED!)"
-echo "Hardware: Multi-GPU with DataParallel"
-echo "Dataset: WikiText-103 with bit-level encoding"
-echo "Optimizations: ALL ENABLED!"
-echo ""
-
-# Set environment variables for optimal performance
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
-export OMP_NUM_THREADS=12
-
-# Set HuggingFace token
-export HF_TOKEN="${HF_TOKEN:-your-token-here}"
-
-# Change to BitTransformerLM directory
-cd /data/BitTransformerLM/BitTransformerLM
-
-# Create checkpoint directory
-mkdir -p /data/checkpoints
-
-echo "Hardware Check:"
-python -c "
-import torch
-print(f'CUDA Available: {torch.cuda.is_available()}')
-print(f'GPU Count: {torch.cuda.device_count()}')
-for i in range(torch.cuda.device_count()):
-    props = torch.cuda.get_device_properties(i)
-    print(f'  GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)')
-"
-
-echo ""
-echo "OPTIMIZATIONS ENABLED:"
-echo "  ✅ Reversible Layers (50% memory savings)"
-echo "  ✅ Gradient Checkpointing"
-echo "  ✅ Mixed Precision (FP16)"
-echo "  ✅ Memory-Mapped Dataset Loading"
-echo "  ✅ Safety Telemetry (K, C, S metrics)"
-echo "  ✅ Bit-Native Processing"
-echo "  ✅ DataParallel Multi-GPU"
-echo ""
-
-echo "Training Configuration:"
-echo "  • Parameters: 679,962,626 (680M)"
-echo "  • Architecture: d_model=1536, layers=24, heads=24"
-echo "  • Batch Size: 2 per GPU"
-echo "  • Gradient Accumulation: 16 steps"
-echo "  • Effective Batch Size: 128"
-echo "  • Learning Rate: 3e-4 with OneCycle"
-echo "  • Dataset: WikiText-103 (2000 training samples)"
-echo ""
-
-echo "Starting optimized training..."
-echo "  This version should train successfully!"
-echo ""
-
-# Launch optimized training
-python massive_scale_simple.py
-
-echo ""
-echo "Training completed successfully!"
-echo "Check /data/checkpoints/ for saved models"