@echo off
REM ============================================================================
REM ULTRATHINK Perfect Training Script (Windows)
REM ============================================================================
REM Fixes: Routing collapse, high perplexity, auxiliary loss issues
REM Optimized for: Small-scale model (512 hidden, 6 layers) on single GPU
REM ============================================================================

echo ========================================
echo ULTRATHINK Perfect Training
echo ========================================

REM Check if train_ultrathink.py exists
if not exist "train_ultrathink.py" (
    echo Error: train_ultrathink.py not found!
    echo Please run this script from the project root directory.
    exit /b 1
)

REM Check GPU availability
where nvidia-smi >nul 2>&1
if %ERRORLEVEL% EQU 0 (
    echo GPU detected:
    nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
) else (
    echo Warning: No GPU detected. Training will be slow on CPU.
)

REM Create output directory
if not exist ".\outputs\ultrathink_fixed" mkdir ".\outputs\ultrathink_fixed"

echo Starting training with optimized configuration...
echo.

REM ============================================================================
REM MAIN TRAINING COMMAND - PERFECT CONFIGURATION
REM ============================================================================
python train_ultrathink.py ^
    --vocab_size 50257 ^
    --hidden_size 512 ^
    --num_layers 6 ^
    --num_heads 8 ^
    --num_kv_heads 4 ^
    --intermediate_size 2048 ^
    --max_seq_length 256 ^
    --activation swiglu ^
    --enable_moe ^
    --num_knowledge_experts 4 ^
    --num_skill_experts 2 ^
    --num_meta_experts 1 ^
    --num_safety_experts 1 ^
    --moe_top_k 2 ^
    --expert_capacity 1.5 ^
    --load_balance_weight 0.1 ^
    --z_loss_weight 0.0001 ^
    --importance_weight 0.05 ^
    --batch_size 2 ^
    --gradient_accumulation_steps 32 ^
    --learning_rate 0.0001 ^
    --weight_decay 0.1 ^
    --adam_beta1 0.9 ^
    --adam_beta2 0.999 ^
    --warmup_steps 1000 ^
    --max_steps 100000 ^
    --num_epochs 1 ^
    --gradient_clipping 0.5 ^
    --dropout 0.15 ^
    --attention_dropout 0.15 ^
    --gradient_checkpointing ^
    --use_amp ^
    --amp_warmup_steps 500 ^
    --enable_dre ^
    --dre_warmup_steps 1000 ^
    --dataset c4 ^
    --dataset_subset en ^
    --tokenizer_name gpt2 ^
    --streaming ^
    --train_samples 10000 ^
    --val_samples 1000 ^
    --num_workers 2 ^
    --use_mlflow ^
    --mlflow_tracking_uri "file:./mlruns" ^
    --mlflow_experiment "UltraThinking-LLM-Training" ^
    --run_name "ultrathink_fixed_routing_v2" ^
    --perf_log_interval 5 ^
    --eval_frequency 50 ^
    --output_dir "./outputs/ultrathink_fixed"

if %ERRORLEVEL% EQU 0 (
    echo.
    echo ========================================
    echo Training completed successfully!
    echo ========================================
    echo.
    echo Output directory: .\outputs\ultrathink_fixed
    echo MLflow logs: .\mlruns
    echo.
    echo To view training metrics:
    echo   mlflow ui --backend-store-uri ./mlruns --port 5000
    echo.
) else (
    echo.
    echo ========================================
    echo Training failed!
    echo ========================================
    echo.
    echo Check the logs above for error details.
    exit /b 1
)