# Launcher configuration for a multi-GPU distributed training run.
# One key per line: when the file is collapsed onto a single line it is no
# longer a valid YAML mapping, and everything after the first " #" is
# swallowed as a comment.

# --- Environment and distributed mode ---
compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
# Quoted so the value stays the string 'no' rather than a YAML 1.1 boolean.
downcast_bf16: 'no'
gpu_ids: all
mixed_precision: 'bf16'
use_cpu: false

# --- Topology and rendezvous ---
# NOTE(review): machine_rank is 0 here; presumably each participating machine
# needs its own copy of this file with a distinct rank -- confirm.
machine_rank: 0
# NOTE(review): 'xx.xx.xx.xx' looks like a placeholder -- replace it with the
# real address of the main process host before launching.
main_process_ip: 'xx.xx.xx.xx'
main_process_port: 36001
num_machines: 2
num_processes: 16
rdzv_backend: static
same_network: true

# --- Entry point ---
main_training_function: main

# --- TPU options ---
tpu_use_cluster: false
tpu_use_sudo: false

# Optional DeepSpeed ZeRO-0 (no sharding). Prefer MULTI_GPU DDP for
# single-node training.
# NOTE(review): num_machines above is 2, so this file describes a multi-node
# launch; confirm the single-node remark still applies.
# NOTE(review): distributed_type above is MULTI_GPU, so this section is
# presumably only consulted if distributed_type is switched to DEEPSPEED --
# confirm against the launcher documentation.
deepspeed_config:
  zero_stage: 0
  gradient_accumulation_steps: 1
  zero3_init_flag: false