#!/usr/bin/env bash
#
# Launch TRUE 1.21B-parameter BitTransformerLM training across 4 GPUs
# using FSDP full sharding (parameters sharded, not replicated).
#
# Steps:
#   1. Configure GPU/runtime environment variables.
#   2. Run a CUDA hardware sanity check (GPU count, per-GPU and total VRAM).
#   3. Print the run configuration, then launch true_1b_training.py.
#
# Environment:
#   HF_TOKEN - Hugging Face access token; must be provided by the caller.
#              Never hardcode credentials here.
#
# NOTE(review): the original file was table/pipe-mangled with mojibake emoji
# in its echo strings; user-facing text has been restored as plain ASCII.

set -euo pipefail

readonly REPO_DIR="/data/BitTransformerLM/BitTransformerLM"
readonly TRAIN_SCRIPT="true_1b_training.py"

echo "TRUE 1.21B PARAMETER BITTRANSFORMERLM TRAINING"
echo "================================================="
echo "PROPER FSDP SHARDING (not duplication!)"
echo "Based on proven 680M success"
echo "Full training + inference testing"
echo ""

# GPU / runtime environment.
export CUDA_VISIBLE_DEVICES=0,1,2,3
# expandable_segments reduces fragmentation in the CUDA caching allocator.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export OMP_NUM_THREADS=12

# Require the token from the environment instead of exporting a placeholder
# (the old "your-token-here" default would be sent as a real credential).
if [[ -n "${HF_TOKEN:-}" ]]; then
  export HF_TOKEN
else
  echo "WARNING: HF_TOKEN is not set; gated model/dataset downloads may fail." >&2
fi

cd "$REPO_DIR" || { echo "ERROR: cannot cd to $REPO_DIR" >&2; exit 1; }

echo "Hardware Check:"
# Quoted heredoc: no shell expansion inside the Python snippet.
python - <<'PY'
import torch

print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"GPU Count: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"  GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)")
total = sum(
    torch.cuda.get_device_properties(i).total_memory
    for i in range(torch.cuda.device_count())
)
print(f"Total VRAM: {total / 1024**3:.1f}GB")
PY

echo ""
echo "TRUE 1.21B CONFIGURATION:"
echo "  Parameters: 1,210,000,000+ (1.21B)"
echo "  Architecture: d_model=2048, layers=24, heads=32"
echo "  Memory Strategy: FSDP Full Sharding across 4 GPUs"
echo "  Sequence Length: 512 (optimized from 680M success)"
echo "  Mixed Precision: FP16"
echo "  Safety Telemetry: K, C, S metrics enabled"
echo "  All Optimizations: Reversible + Checkpointing + Chunked Attention"
echo ""

echo "Starting TRUE 1.21B parameter training..."
echo "  This WILL work - we've proven the capability!"
echo ""

# set -e aborts the script if training fails, so the success banner below
# is only reached on a clean exit.
python "$TRAIN_SCRIPT"

echo ""
echo "TRUE 1.21B BITTRANSFORMERLM TRAINING COMPLETED!"
echo "Check /data/true_1b_results.json for full results"
echo "Model checkpoint saved for inference"
echo "Inference testing completed"