diff --git "a/training_log.txt" "b/training_log.txt" new file mode 100644--- /dev/null +++ "b/training_log.txt" @@ -0,0 +1,44623 @@ +W0426 19:45:45.707353 1560894 site-packages/torch/distributed/run.py:766] +W0426 19:45:45.707353 1560894 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 19:45:45.707353 1560894 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0426 19:45:45.707353 1560894 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-26 19:45:46,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 19:45:46,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 19:45:46,984] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 19:45:49,911] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 19:45:49,911] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. Using PIL to load images. +04/26/2025 19:45:50 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 19:45:50 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=internvl_chat/zero_stage1_config.json, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=200, +evaluation_strategy=steps, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=42, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=False, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=4e-05, +length_column_name=length, +load_best_model_at_end=True, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/runs/Apr26_19-45-49_nlp-in-477-l.soe.ucsc.edu, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_kwargs={}, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=eval_loss, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=3.0, +optim=adamw_torch, +optim_args=None, +output_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['wandb'], +resume_from_checkpoint=None, +run_name=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=steps, +save_total_limit=2, +seed=42, +skip_memory_metrics=True, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.01, +) +04/26/2025 19:45:50 - INFO - __main__ - Loading Tokenizer: pretrained/InternVL2_5-2B +[INFO|tokenization_utils_base.py:2025] 2025-04-26 19:45:50,064 >> loading file ./tokenizer.model +[INFO|tokenization_utils_base.py:2025] 2025-04-26 19:45:50,064 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 19:45:50,064 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 19:45:50,064 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 19:45:50,064 >> loading file tokenizer.json +[2025-04-26 19:45:50,130] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 19:45:50,130] [INFO] [comm.py:652:init_distributed] cdb=None +[WARNING|logging.py:314] 2025-04-26 19:45:50,171 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +04/26/2025 19:45:50 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 19:45:50 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 19:45:50 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:727] 2025-04-26 19:45:50,259 >> loading configuration file pretrained/InternVL2_5-2B/config.json +[INFO|configuration_utils.py:792] 2025-04-26 19:45:50,260 >> Model config InternVLChatConfig { + "_commit_hash": null, + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 2048, + "llm_config": { + "_name_or_path": "internlm/internlm2_5-1_8b-chat", + "add_cross_attention": false, + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "flash_attention_2", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForSequenceClassification": "modeling_internlm2.InternLM2ForSequenceClassification" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bias": false, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "min_length": 0, + "model_type": "internlm2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 2, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": true, + "vocab_size": 92553 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "template": "internvl2_5", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.0, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "no_repeat_ngram_size": 0, + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true + } +} + +04/26/2025 19:45:50 - INFO - __main__ - Using flash_attention_2 for InternLM +[INFO|modeling_utils.py:3473] 2025-04-26 19:45:50,263 >> loading weights file pretrained/InternVL2_5-2B/model.safetensors +[INFO|modeling_utils.py:1426] 2025-04-26 19:45:50,273 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:826] 2025-04-26 19:45:50,274 >> Generate config GenerationConfig {} + +[INFO|configuration_utils.py:826] 2025-04-26 19:45:50,302 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +[WARNING|logging.py:314] 2025-04-26 19:45:50,326 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2025-04-26 19:45:50,345 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|modeling_utils.py:4350] 2025-04-26 19:45:52,906 >> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4358] 2025-04-26 19:45:52,906 >> All the weights of InternVLChatModel were initialized from the model checkpoint at pretrained/InternVL2_5-2B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:779] 2025-04-26 19:45:52,909 >> loading configuration file pretrained/InternVL2_5-2B/generation_config.json +[INFO|configuration_utils.py:826] 2025-04-26 19:45:52,909 >> Generate config GenerationConfig { + "eos_token_id": [ + 92542, + 92543 + ] +} + +04/26/2025 19:45:53 - INFO - __main__ - Finished +04/26/2025 19:45:53 - INFO - __main__ - model.config.force_image_size: 448 +04/26/2025 19:45:53 - INFO - __main__ - data_args.force_image_size: 448 +04/26/2025 19:45:53 - INFO - __main__ - model.config.vision_config.image_size: 448 +04/26/2025 19:45:53 - INFO - __main__ - [Dataset] num_image_token: 256 +04/26/2025 19:45:53 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/26/2025 19:45:53 - INFO - __main__ - [Dataset] use_thumbnail: True +04/26/2025 19:45:53 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/26/2025 19:45:53 - INFO - __main__ - Formatting inputs...Skip in lazy mode +04/26/2025 19:45:53 - INFO - __main__ - Add dataset: bbox with length: 9967 +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.tok_embeddings.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.0.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.0.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.0.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.0.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.0.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.0.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.0.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.1.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.1.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.1.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.1.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.1.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.1.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.1.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.2.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.2.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.2.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.2.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.2.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.2.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.2.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.3.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.3.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.3.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.3.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.3.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.3.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.3.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.4.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.4.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.4.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.4.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.4.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.4.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.4.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.5.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.5.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.5.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.5.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.5.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.5.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.5.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.6.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.6.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.6.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.6.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.6.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.6.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.6.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.7.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.7.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.7.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.7.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.7.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.7.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.7.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.8.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.8.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.8.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.8.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.8.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.8.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.8.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.9.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.9.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.9.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.9.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.9.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.9.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.9.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.10.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.10.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.10.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.10.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.10.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.10.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.10.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.11.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.11.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.11.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.11.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.11.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.11.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.11.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.12.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.12.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.12.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.12.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.12.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.12.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.12.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.13.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.13.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.13.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.13.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.13.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.13.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.13.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.14.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.14.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.14.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.14.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.14.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.14.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.14.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.15.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.15.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.15.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.15.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.15.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.15.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.15.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.16.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.16.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.16.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.16.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.16.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.16.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.16.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.17.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.17.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.17.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.17.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.17.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.17.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.17.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.18.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.18.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.18.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.18.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.18.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.18.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.18.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.19.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.19.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.19.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.19.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.19.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.19.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.19.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.20.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.20.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.20.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.20.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.20.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.20.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.20.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.21.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.21.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.21.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.21.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.21.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.21.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.21.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.22.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.22.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.22.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.22.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.22.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.22.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.22.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.23.attention.wqkv.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.23.attention.wo.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.23.feed_forward.w1.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.23.feed_forward.w3.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.23.feed_forward.w2.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.23.attention_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.layers.23.ffn_norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.model.norm.weight +04/26/2025 19:45:53 - INFO - __main__ - language_model.output.weight +04/26/2025 19:45:53 - INFO - __main__ - mlp1.0.weight +04/26/2025 19:45:53 - INFO - __main__ - mlp1.0.bias +04/26/2025 19:45:53 - INFO - __main__ - mlp1.1.weight +04/26/2025 19:45:53 - INFO - __main__ - mlp1.1.bias +04/26/2025 19:45:53 - INFO - __main__ - mlp1.3.weight +04/26/2025 19:45:53 - INFO - __main__ - mlp1.3.bias +[INFO|trainer.py:571] 2025-04-26 19:45:53,155 >> Using auto half precision backend +[2025-04-26 19:45:53,342] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.15.4, git-hash=unknown, git-branch=unknown +[2025-04-26 19:45:53,343] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 3 +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +[2025-04-26 19:45:54,632] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file /data/diji/.cache/torch_extensions/py39_cu126/fused_adam/build.ninja... +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.3298048973083496 seconds +Loading extension module fused_adam... +Time to load fused_adam op: 0.40149950981140137 seconds +Loading extension module fused_adam... +Time to load fused_adam op: 0.4016404151916504 seconds +[2025-04-26 19:45:55,486] [INFO] [logging.py:128:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer +[2025-04-26 19:45:55,486] [INFO] [logging.py:128:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2025-04-26 19:45:55,492] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +[2025-04-26 19:45:55,492] [INFO] [utils.py:59:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[2025-04-26 19:45:55,492] [INFO] [logging.py:128:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 1 optimizer +[2025-04-26 19:45:55,492] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 1000000000 +[2025-04-26 19:45:55,492] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 1000000000 +[2025-04-26 19:45:55,492] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False +[2025-04-26 19:45:55,492] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False +[2025-04-26 19:45:59,076] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states +[2025-04-26 19:45:59,076] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 8.03 GB CA 8.4 GB Max_CA 8 GB +[2025-04-26 19:45:59,077] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 75.0 GB, percent = 14.9% +[2025-04-26 19:45:59,235] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states +[2025-04-26 19:45:59,235] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 9.21 GB CA 10.76 GB Max_CA 11 GB +[2025-04-26 19:45:59,235] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 73.95 GB, percent = 14.7% +[2025-04-26 19:45:59,235] [INFO] [stage_1_and_2.py:544:__init__] optimizer state initialized +[2025-04-26 19:45:59,393] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer +[2025-04-26 19:45:59,393] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 6.85 GB CA 10.76 GB Max_CA 11 GB +[2025-04-26 19:45:59,393] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 73.94 GB, percent = 14.7% +[2025-04-26 19:45:59,394] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer +[2025-04-26 19:45:59,394] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler +[2025-04-26 19:45:59,394] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2025-04-26 19:45:59,394] [INFO] [logging.py:128:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[[0.9, 0.999]] +[2025-04-26 19:45:59,395] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] amp_enabled .................. False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] amp_params ................... False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] bfloat16_enabled ............. True +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] bfloat16_immediate_grad_update False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] comms_config ................. +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] communication_data_type ...... None +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] disable_allgather ............ False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] dump_state ................... False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... None +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] fp16_auto_cast ............... None +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] fp16_enabled ................. False +[2025-04-26 19:45:59,396] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] global_rank .................. 0 +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 42 +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 1 +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] loss_scale ................... 1.0 +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] mics_shard_size .............. -1 +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] optimizer_name ............... adamw +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] optimizer_params ............. {'lr': 4e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.01} +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] pld_enabled .................. False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] pld_params ................... False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] prescale_gradients ........... False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] scheduler_name ............... None +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] scheduler_params ............. None +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] seq_parallel_communication_data_type torch.float32 +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] sparse_attention ............. None +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] steps_per_print .............. inf +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] train_batch_size ............. 126 +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 1 +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] wall_clock_breakdown ......... True +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] weight_quantization_config ... None +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] world_size ................... 3 +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=1000000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=1000000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] zero_enabled ................. True +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2025-04-26 19:45:59,397] [INFO] [config.py:1003:print] zero_optimization_stage ...... 1 +[2025-04-26 19:45:59,397] [INFO] [config.py:989:print_user_config] json = { + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1.000000e+09, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1.000000e+09, + "contiguous_gradients": true + }, + "fp16": { + "enabled": false, + "auto_cast": true, + "loss_scale": 0, + "initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 4e-05, + "betas": [0.9, 0.999], + "eps": 1e-08, + "weight_decay": 0.01 + } + }, + "gradient_accumulation_steps": 42, + "gradient_clipping": 1.0, + "steps_per_print": inf, + "train_batch_size": 126, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": true +} +[INFO|trainer.py:1721] 2025-04-26 19:45:59,397 >> ***** Running training ***** +[INFO|trainer.py:1722] 2025-04-26 19:45:59,398 >> Num examples = 9,967 +[INFO|trainer.py:1723] 2025-04-26 19:45:59,398 >> Num Epochs = 3 +[INFO|trainer.py:1724] 2025-04-26 19:45:59,398 >> Instantaneous batch size per device = 1 +[INFO|trainer.py:1727] 2025-04-26 19:45:59,398 >> Total train batch size (w. parallel, distributed & accumulation) = 126 +[INFO|trainer.py:1728] 2025-04-26 19:45:59,398 >> Gradient Accumulation steps = 42 +[INFO|trainer.py:1729] 2025-04-26 19:45:59,398 >> Total optimization steps = 237 +[INFO|trainer.py:1730] 2025-04-26 19:45:59,398 >> Number of trainable parameters = 1,901,742,080 +[INFO|integration_utils.py:722] 2025-04-26 19:45:59,399 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" +wandb: Currently logged in as: dyang39 to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.19.10 +wandb: Run data is saved locally in /data/diji/InternVL/wandb/run-20250426_194600-j0qrx6xh +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run vivid-elevator-131 +wandb: ⭐️ View project at https://wandb.ai/dyang39/huggingface +wandb: 🚀 View run at https://wandb.ai/dyang39/huggingface/runs/j0qrx6xh + 0%| | 0/237 [00:00, std::allocator >) + 0x98 (0x7fb51ed785e8 in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libc10.so) +frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x23d (0x7fb4c4e2ea1d in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0xc80 (0x7fb4c4e307a0 in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7fb4c4e31ead in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #4: + 0xe62b3 (0x7fb4b4da12b3 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6) +frame #5: + 0x94ac3 (0x7fb52397aac3 in /usr/lib/x86_64-linux-gnu/libc.so.6) +frame #6: + 0x126850 (0x7fb523a0c850 in /usr/lib/x86_64-linux-gnu/libc.so.6) + +terminate called after throwing an instance of 'c10::DistBackendError' + what(): [PG ID 1 PG GUID 1 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=610, OpType=ALLREDUCE, NumelIn=999102464, NumelOut=999102464, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. +Exception raised from checkTimeout at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:635 (most recent call first): +frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x7fb51ed785e8 in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libc10.so) +frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x23d (0x7fb4c4e2ea1d in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0xc80 (0x7fb4c4e307a0 in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7fb4c4e31ead in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #4: + 0xe62b3 (0x7fb4b4da12b3 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6) +frame #5: + 0x94ac3 (0x7fb52397aac3 in /usr/lib/x86_64-linux-gnu/libc.so.6) +frame #6: + 0x126850 (0x7fb523a0c850 in /usr/lib/x86_64-linux-gnu/libc.so.6) + +Exception raised from ncclCommWatchdog at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1902 (most recent call first): +frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x7fb51ed785e8 in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libc10.so) +frame #1: + 0x11b4a6e (0x7fb4c4e00a6e in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #2: + 0xe07bed (0x7fb4c4a53bed in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #3: + 0xe62b3 (0x7fb4b4da12b3 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6) +frame #4: + 0x94ac3 (0x7fb52397aac3 in /usr/lib/x86_64-linux-gnu/libc.so.6) +frame #5: + 0x126850 (0x7fb523a0c850 in /usr/lib/x86_64-linux-gnu/libc.so.6) + +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[rank2]:[E426 20:18:38.558586693 ProcessGroupNCCL.cpp:684] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. +[rank2]:[E426 20:18:38.558600770 ProcessGroupNCCL.cpp:698] [Rank 2] To avoid data inconsistency, we are taking the entire process down. +[rank2]:[E426 20:18:38.559468407 ProcessGroupNCCL.cpp:1896] [PG ID 1 PG GUID 1 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=610, OpType=ALLREDUCE, NumelIn=999102464, NumelOut=999102464, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. +Exception raised from checkTimeout at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:635 (most recent call first): +frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x7f7cfc5785e8 in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libc10.so) +frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x23d (0x7f7ca262ea1d in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0xc80 (0x7f7ca26307a0 in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7f7ca2631ead in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #4: + 0xe62b3 (0x7f7c925a12b3 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6) +frame #5: + 0x94ac3 (0x7f7d011b4ac3 in /usr/lib/x86_64-linux-gnu/libc.so.6) +frame #6: + 0x126850 (0x7f7d01246850 in /usr/lib/x86_64-linux-gnu/libc.so.6) + +terminate called after throwing an instance of 'c10::DistBackendError' +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. + what(): [PG ID 1 PG GUID 1 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=610, OpType=ALLREDUCE, NumelIn=999102464, NumelOut=999102464, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. +Exception raised from checkTimeout at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:635 (most recent call first): +frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x7f7cfc5785e8 in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libc10.so) +frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x23d (0x7f7ca262ea1d in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0xc80 (0x7f7ca26307a0 in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7f7ca2631ead in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #4: + 0xe62b3 (0x7f7c925a12b3 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6) +frame #5: + 0x94ac3 (0x7f7d011b4ac3 in /usr/lib/x86_64-linux-gnu/libc.so.6) +frame #6: + 0x126850 (0x7f7d01246850 in /usr/lib/x86_64-linux-gnu/libc.so.6) + +Exception raised from ncclCommWatchdog at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1902 (most recent call first): +frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x7f7cfc5785e8 in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libc10.so) +frame #1: + 0x11b4a6e (0x7f7ca2600a6e in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #2: + 0xe07bed (0x7f7ca2253bed in /data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/lib/libtorch_cuda.so) +frame #3: + 0xe62b3 (0x7f7c925a12b3 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6) +frame #4: + 0x94ac3 (0x7f7d011b4ac3 in /usr/lib/x86_64-linux-gnu/libc.so.6) +frame #5: + 0x126850 (0x7f7d01246850 in /usr/lib/x86_64-linux-gnu/libc.so.6) + +[rank0]:[E426 20:18:38.749904924 ProcessGroupNCCL.cpp:1682] [PG ID 0 PG GUID 0(default_pg) Rank 0] Observed flight recorder dump signal from another rank via TCPStore. +[rank0]:[E426 20:18:38.750050860 ProcessGroupNCCL.cpp:1743] [PG ID 0 PG GUID 0(default_pg) Rank 0] Received a dump signal due to a collective timeout from rank 2 and we will try our best to dump the debug info. Last enqueued NCCL work: 47, last completed NCCL work: 47.This is most likely caused by incorrect usages of collectives, e.g., wrong sizes used across ranks, the order of collectives is not same for all ranks or the scheduled collective, for some reason, didn't run. Additionally, this can be caused by GIL deadlock or other reasons such as network errors or bugs in the communications library (e.g. NCCL), etc. +[rank0]:[E426 20:18:38.750152451 ProcessGroupNCCL.cpp:1533] [PG ID 0 PG GUID 0(default_pg) Rank 0] ProcessGroupNCCL preparing to dump debug info. Include stack trace: 1 +[2025-04-26 20:18:38,481] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 160.92 | bwd_microstep: 295.36 | bwd_inner_microstep: 295.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.07 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +W0426 20:18:38.748398 1560894 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1560986 closing signal SIGTERM +/data/diji/.conda/envs/internvl/lib/python3.9/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 21 leaked semaphore objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' +E0426 20:18:39.163626 1560894 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: -6) local_rank: 1 (pid: 1560987) of binary: /data/diji/.conda/envs/internvl/bin/python +Traceback (most recent call last): + File "/data/diji/.conda/envs/internvl/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==2.7.0', 'console_scripts', 'torchrun')()) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +internvl_chat/internvl/train/internvl_chat_finetune.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2025-04-26_20:18:38 + host : nlp-in-477-l.soe.ucsc.edu + rank : 2 (local_rank: 2) + exitcode : -6 (pid: 1560988) + error_file: + traceback : Signal 6 (SIGABRT) received by PID 1560988 +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-04-26_20:18:38 + host : nlp-in-477-l.soe.ucsc.edu + rank : 1 (local_rank: 1) + exitcode : -6 (pid: 1560987) + error_file: + traceback : Signal 6 (SIGABRT) received by PID 1560987 +============================================================ +/data/diji/.conda/envs/internvl/lib/python3.9/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 20 leaked semaphore objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' +/data/diji/.conda/envs/internvl/lib/python3.9/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 20 leaked semaphore objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' +W0426 20:22:04.819303 1612021 site-packages/torch/distributed/run.py:766] +W0426 20:22:04.819303 1612021 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 20:22:04.819303 1612021 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0426 20:22:04.819303 1612021 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-26 20:22:06,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 20:22:06,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 20:22:06,089] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 20:22:09,027] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 20:22:09,027] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +04/26/2025 20:22:09 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 20:22:09 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=internvl_chat/zero_stage1_config.json, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=200, +evaluation_strategy=steps, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=32, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=False, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=4e-05, +length_column_name=length, +load_best_model_at_end=True, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/runs/Apr26_20-22-09_nlp-in-477-l.soe.ucsc.edu, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_kwargs={}, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=eval_loss, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=3.0, +optim=adamw_torch, +optim_args=None, +output_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['wandb'], +resume_from_checkpoint=None, +run_name=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=steps, +save_total_limit=2, +seed=42, +skip_memory_metrics=True, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.01, +) +04/26/2025 20:22:09 - INFO - __main__ - Loading Tokenizer: pretrained/InternVL2_5-2B +[INFO|tokenization_utils_base.py:2025] 2025-04-26 20:22:09,139 >> loading file ./tokenizer.model +[INFO|tokenization_utils_base.py:2025] 2025-04-26 20:22:09,139 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 20:22:09,140 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 20:22:09,140 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 20:22:09,140 >> loading file tokenizer.json +[2025-04-26 20:22:09,221] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 20:22:09,237] [INFO] [comm.py:652:init_distributed] cdb=None +[WARNING|logging.py:314] 2025-04-26 20:22:09,250 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +04/26/2025 20:22:09 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 20:22:09 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 20:22:09 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:727] 2025-04-26 20:22:09,337 >> loading configuration file pretrained/InternVL2_5-2B/config.json +[INFO|configuration_utils.py:792] 2025-04-26 20:22:09,338 >> Model config InternVLChatConfig { + "_commit_hash": null, + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 2048, + "llm_config": { + "_name_or_path": "internlm/internlm2_5-1_8b-chat", + "add_cross_attention": false, + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "flash_attention_2", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForSequenceClassification": "modeling_internlm2.InternLM2ForSequenceClassification" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bias": false, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "min_length": 0, + "model_type": "internlm2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 2, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": true, + "vocab_size": 92553 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "template": "internvl2_5", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.0, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "no_repeat_ngram_size": 0, + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true + } +} + +04/26/2025 20:22:09 - INFO - __main__ - Using flash_attention_2 for InternLM +[INFO|modeling_utils.py:3473] 2025-04-26 20:22:09,339 >> loading weights file pretrained/InternVL2_5-2B/model.safetensors +[INFO|modeling_utils.py:1426] 2025-04-26 20:22:09,350 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:826] 2025-04-26 20:22:09,351 >> Generate config GenerationConfig {} + +[INFO|configuration_utils.py:826] 2025-04-26 20:22:09,379 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +[WARNING|logging.py:314] 2025-04-26 20:22:09,424 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2025-04-26 20:22:09,427 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|modeling_utils.py:4350] 2025-04-26 20:22:11,966 >> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4358] 2025-04-26 20:22:11,966 >> All the weights of InternVLChatModel were initialized from the model checkpoint at pretrained/InternVL2_5-2B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:779] 2025-04-26 20:22:11,969 >> loading configuration file pretrained/InternVL2_5-2B/generation_config.json +[INFO|configuration_utils.py:826] 2025-04-26 20:22:11,969 >> Generate config GenerationConfig { + "eos_token_id": [ + 92542, + 92543 + ] +} + +04/26/2025 20:22:12 - INFO - __main__ - Finished +04/26/2025 20:22:12 - INFO - __main__ - model.config.force_image_size: 448 +04/26/2025 20:22:12 - INFO - __main__ - data_args.force_image_size: 448 +04/26/2025 20:22:12 - INFO - __main__ - model.config.vision_config.image_size: 448 +04/26/2025 20:22:12 - INFO - __main__ - [Dataset] num_image_token: 256 +04/26/2025 20:22:12 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/26/2025 20:22:12 - INFO - __main__ - [Dataset] use_thumbnail: True +04/26/2025 20:22:12 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/26/2025 20:22:12 - INFO - __main__ - Formatting inputs...Skip in lazy mode +04/26/2025 20:22:12 - INFO - __main__ - Add dataset: bbox with length: 9967 +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.tok_embeddings.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.0.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.0.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.0.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.0.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.0.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.0.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.0.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.1.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.1.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.1.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.1.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.1.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.1.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.1.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.2.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.2.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.2.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.2.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.2.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.2.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.2.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.3.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.3.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.3.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.3.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.3.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.3.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.3.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.4.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.4.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.4.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.4.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.4.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.4.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.4.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.5.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.5.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.5.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.5.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.5.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.5.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.5.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.6.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.6.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.6.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.6.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.6.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.6.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.6.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.7.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.7.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.7.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.7.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.7.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.7.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.7.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.8.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.8.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.8.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.8.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.8.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.8.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.8.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.9.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.9.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.9.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.9.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.9.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.9.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.9.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.10.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.10.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.10.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.10.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.10.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.10.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.10.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.11.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.11.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.11.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.11.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.11.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.11.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.11.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.12.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.12.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.12.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.12.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.12.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.12.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.12.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.13.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.13.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.13.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.13.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.13.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.13.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.13.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.14.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.14.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.14.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.14.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.14.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.14.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.14.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.15.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.15.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.15.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.15.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.15.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.15.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.15.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.16.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.16.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.16.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.16.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.16.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.16.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.16.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.17.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.17.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.17.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.17.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.17.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.17.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.17.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.18.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.18.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.18.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.18.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.18.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.18.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.18.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.19.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.19.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.19.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.19.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.19.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.19.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.19.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.20.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.20.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.20.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.20.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.20.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.20.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.20.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.21.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.21.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.21.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.21.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.21.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.21.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.21.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.22.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.22.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.22.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.22.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.22.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.22.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.22.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.23.attention.wqkv.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.23.attention.wo.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.23.feed_forward.w1.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.23.feed_forward.w3.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.23.feed_forward.w2.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.23.attention_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.layers.23.ffn_norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.model.norm.weight +04/26/2025 20:22:12 - INFO - __main__ - language_model.output.weight +04/26/2025 20:22:12 - INFO - __main__ - mlp1.0.weight +04/26/2025 20:22:12 - INFO - __main__ - mlp1.0.bias +04/26/2025 20:22:12 - INFO - __main__ - mlp1.1.weight +04/26/2025 20:22:12 - INFO - __main__ - mlp1.1.bias +04/26/2025 20:22:12 - INFO - __main__ - mlp1.3.weight +04/26/2025 20:22:12 - INFO - __main__ - mlp1.3.bias +[INFO|trainer.py:571] 2025-04-26 20:22:12,208 >> Using auto half precision backend +[2025-04-26 20:22:12,398] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.15.4, git-hash=unknown, git-branch=unknown +[2025-04-26 20:22:12,398] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 3 +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +[2025-04-26 20:22:13,469] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file /data/diji/.cache/torch_extensions/py39_cu126/fused_adam/build.ninja... +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.32430052757263184 seconds +Loading extension module fused_adam... +Time to load fused_adam op: 0.40192079544067383 seconds +Loading extension module fused_adam... +Time to load fused_adam op: 0.40145349502563477 seconds +[2025-04-26 20:22:14,327] [INFO] [logging.py:128:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer +[2025-04-26 20:22:14,327] [INFO] [logging.py:128:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2025-04-26 20:22:14,333] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +[2025-04-26 20:22:14,333] [INFO] [utils.py:59:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[2025-04-26 20:22:14,333] [INFO] [logging.py:128:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 1 optimizer +[2025-04-26 20:22:14,333] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 1000000000 +[2025-04-26 20:22:14,333] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 1000000000 +[2025-04-26 20:22:14,333] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False +[2025-04-26 20:22:14,333] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False +[2025-04-26 20:22:17,986] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states +[2025-04-26 20:22:17,986] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 8.03 GB CA 8.4 GB Max_CA 8 GB +[2025-04-26 20:22:17,987] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 78.05 GB, percent = 15.5% +[2025-04-26 20:22:18,141] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states +[2025-04-26 20:22:18,142] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 9.21 GB CA 10.76 GB Max_CA 11 GB +[2025-04-26 20:22:18,142] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 78.06 GB, percent = 15.5% +[2025-04-26 20:22:18,142] [INFO] [stage_1_and_2.py:544:__init__] optimizer state initialized +[2025-04-26 20:22:18,291] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer +[2025-04-26 20:22:18,291] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 6.85 GB CA 10.76 GB Max_CA 11 GB +[2025-04-26 20:22:18,291] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 78.06 GB, percent = 15.5% +[2025-04-26 20:22:18,292] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer +[2025-04-26 20:22:18,292] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler +[2025-04-26 20:22:18,293] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2025-04-26 20:22:18,293] [INFO] [logging.py:128:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[[0.9, 0.999]] +[2025-04-26 20:22:18,294] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] amp_enabled .................. False +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] amp_params ................... False +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] bfloat16_enabled ............. True +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] bfloat16_immediate_grad_update False +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] comms_config ................. +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] communication_data_type ...... None +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] disable_allgather ............ False +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] dump_state ................... False +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... None +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2025-04-26 20:22:18,294] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] fp16_auto_cast ............... None +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] fp16_enabled ................. False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] global_rank .................. 0 +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 32 +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 1 +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] loss_scale ................... 1.0 +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] mics_shard_size .............. -1 +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] optimizer_name ............... adamw +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] optimizer_params ............. {'lr': 4e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.01} +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] pld_enabled .................. False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] pld_params ................... False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] prescale_gradients ........... False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] scheduler_name ............... None +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] scheduler_params ............. None +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] seq_parallel_communication_data_type torch.float32 +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] sparse_attention ............. None +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] steps_per_print .............. inf +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] train_batch_size ............. 96 +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 1 +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] wall_clock_breakdown ......... True +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] weight_quantization_config ... None +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] world_size ................... 3 +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=1000000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=1000000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] zero_enabled ................. True +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2025-04-26 20:22:18,295] [INFO] [config.py:1003:print] zero_optimization_stage ...... 1 +[2025-04-26 20:22:18,295] [INFO] [config.py:989:print_user_config] json = { + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1.000000e+09, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1.000000e+09, + "contiguous_gradients": true + }, + "fp16": { + "enabled": false, + "auto_cast": true, + "loss_scale": 0, + "initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 4e-05, + "betas": [0.9, 0.999], + "eps": 1e-08, + "weight_decay": 0.01 + } + }, + "gradient_accumulation_steps": 32, + "gradient_clipping": 1.0, + "steps_per_print": inf, + "train_batch_size": 96, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": true +} +[INFO|trainer.py:1721] 2025-04-26 20:22:18,296 >> ***** Running training ***** +[INFO|trainer.py:1722] 2025-04-26 20:22:18,296 >> Num examples = 9,967 +[INFO|trainer.py:1723] 2025-04-26 20:22:18,296 >> Num Epochs = 3 +[INFO|trainer.py:1724] 2025-04-26 20:22:18,296 >> Instantaneous batch size per device = 1 +[INFO|trainer.py:1727] 2025-04-26 20:22:18,296 >> Total train batch size (w. parallel, distributed & accumulation) = 96 +[INFO|trainer.py:1728] 2025-04-26 20:22:18,296 >> Gradient Accumulation steps = 32 +[INFO|trainer.py:1729] 2025-04-26 20:22:18,296 >> Total optimization steps = 309 +[INFO|trainer.py:1730] 2025-04-26 20:22:18,297 >> Number of trainable parameters = 1,901,742,080 +[INFO|integration_utils.py:722] 2025-04-26 20:22:18,297 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" +wandb: Currently logged in as: dyang39 to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.19.10 +wandb: Run data is saved locally in /data/diji/InternVL/wandb/run-20250426_202218-ltudwj3u +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run easy-valley-132 +wandb: ⭐️ View project at https://wandb.ai/dyang39/huggingface +wandb: 🚀 View run at https://wandb.ai/dyang39/huggingface/runs/ltudwj3u + 0%| | 0/309 [00:00 +[rank1]: main() +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1057, in main +[rank1]: train_result = trainer.train(resume_from_checkpoint=checkpoint) +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1539, in train +[rank1]: return inner_training_loop( +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1929, in _inner_training_loop +[rank1]: self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 2291, in _maybe_log_save_evaluate +[rank1]: metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 3091, in evaluate +[rank1]: eval_dataloader = self.get_eval_dataloader(eval_dataset) +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 846, in get_eval_dataloader +[rank1]: raise ValueError("Trainer: evaluation requires an eval_dataset.") +[rank1]: ValueError: Trainer: evaluation requires an eval_dataset. +Traceback (most recent call last): + File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in + main() + File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1057, in main + train_result = trainer.train(resume_from_checkpoint=checkpoint) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1539, in train + return inner_training_loop( + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1929, in _inner_training_loop + self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 2291, in _maybe_log_save_evaluate + metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 3091, in evaluate + eval_dataloader = self.get_eval_dataloader(eval_dataset) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 846, in get_eval_dataloader + raise ValueError("Trainer: evaluation requires an eval_dataset.") +ValueError: Trainer: evaluation requires an eval_dataset. +[rank0]: Traceback (most recent call last): +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank0]: main() +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1057, in main +[rank0]: train_result = trainer.train(resume_from_checkpoint=checkpoint) +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1539, in train +[rank0]: return inner_training_loop( +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1929, in _inner_training_loop +[rank0]: self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 2291, in _maybe_log_save_evaluate +[rank0]: metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 3091, in evaluate +[rank0]: eval_dataloader = self.get_eval_dataloader(eval_dataset) +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 846, in get_eval_dataloader +[rank0]: raise ValueError("Trainer: evaluation requires an eval_dataset.") +[rank0]: ValueError: Trainer: evaluation requires an eval_dataset. +[rank2]: Traceback (most recent call last): +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank2]: main() +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1057, in main +[rank2]: train_result = trainer.train(resume_from_checkpoint=checkpoint) +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1539, in train +[rank2]: return inner_training_loop( +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1929, in _inner_training_loop +[rank2]: self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 2291, in _maybe_log_save_evaluate +[rank2]: metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 3091, in evaluate +[rank2]: eval_dataloader = self.get_eval_dataloader(eval_dataset) +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 846, in get_eval_dataloader +[rank2]: raise ValueError("Trainer: evaluation requires an eval_dataset.") +[rank2]: ValueError: Trainer: evaluation requires an eval_dataset. +wandb: +wandb: 🚀 View run easy-valley-132 at: https://wandb.ai/dyang39/huggingface/runs/ltudwj3u +wandb: Find logs at: wandb/run-20250426_202218-ltudwj3u/logs +[rank0]:[W426 21:20:34.231275386 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +W0426 21:20:35.496060 1612021 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1612113 closing signal SIGTERM +W0426 21:20:35.496508 1612021 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1612115 closing signal SIGTERM +E0426 21:20:35.860965 1612021 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 1 (pid: 1612114) of binary: /data/diji/.conda/envs/internvl/bin/python +Traceback (most recent call last): + File "/data/diji/.conda/envs/internvl/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==2.7.0', 'console_scripts', 'torchrun')()) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +internvl_chat/internvl/train/internvl_chat_finetune.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-04-26_21:20:35 + host : nlp-in-477-l.soe.ucsc.edu + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 1612114) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +W0426 21:38:41.940739 1730359 site-packages/torch/distributed/run.py:766] +W0426 21:38:41.940739 1730359 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 21:38:41.940739 1730359 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0426 21:38:41.940739 1730359 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-26 21:38:43,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 21:38:43,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 21:38:43,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 21:38:46,163] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 21:38:46,163] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[rank0]: Traceback (most recent call last): +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank0]: main() +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 815, in main +[rank0]: model_args, data_args, training_args = parser.parse_args_into_dataclasses() +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses +[rank0]: obj = dtype(**inputs) +[rank0]: File "", line 121, in __init__ +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/training_args.py", line 1378, in __post_init__ +[rank0]: raise ValueError( +[rank0]: ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found +[rank0]: - Evaluation strategy: no +[rank0]: - Save strategy: steps +[2025-04-26 21:38:46,317] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 21:38:46,317] [INFO] [comm.py:652:init_distributed] cdb=None +[rank2]: Traceback (most recent call last): +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank2]: main() +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 815, in main +[rank2]: model_args, data_args, training_args = parser.parse_args_into_dataclasses() +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses +[rank2]: obj = dtype(**inputs) +[rank2]: File "", line 121, in __init__ +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/training_args.py", line 1378, in __post_init__ +[rank2]: raise ValueError( +[rank2]: ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found +[rank2]: - Evaluation strategy: no +[rank2]: - Save strategy: steps +[rank1]: Traceback (most recent call last): +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank1]: main() +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 815, in main +[rank1]: model_args, data_args, training_args = parser.parse_args_into_dataclasses() +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses +[rank1]: obj = dtype(**inputs) +[rank1]: File "", line 121, in __init__ +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/training_args.py", line 1378, in __post_init__ +[rank1]: raise ValueError( +[rank1]: ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found +[rank1]: - Evaluation strategy: no +[rank1]: - Save strategy: steps +[rank0]:[W426 21:38:46.131314093 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +W0426 21:38:47.087176 1730359 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1730440 closing signal SIGTERM +W0426 21:38:47.087635 1730359 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1730441 closing signal SIGTERM +E0426 21:38:47.233193 1730359 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 1730439) of binary: /data/diji/.conda/envs/internvl/bin/python +Traceback (most recent call last): + File "/data/diji/.conda/envs/internvl/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==2.7.0', 'console_scripts', 'torchrun')()) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +internvl_chat/internvl/train/internvl_chat_finetune.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-04-26_21:38:47 + host : nlp-in-477-l.soe.ucsc.edu + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 1730439) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +W0426 21:39:04.805497 1730999 site-packages/torch/distributed/run.py:766] +W0426 21:39:04.805497 1730999 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 21:39:04.805497 1730999 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0426 21:39:04.805497 1730999 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-26 21:39:06,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 21:39:06,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 21:39:06,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 21:39:08,978] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 21:39:08,978] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[rank0]: Traceback (most recent call last): +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank0]: main() +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 815, in main +[rank0]: model_args, data_args, training_args = parser.parse_args_into_dataclasses() +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses +[rank0]: obj = dtype(**inputs) +[rank0]: File "", line 121, in __init__ +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/training_args.py", line 1378, in __post_init__ +[rank0]: raise ValueError( +[rank0]: ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found +[rank0]: - Evaluation strategy: no +[rank0]: - Save strategy: steps +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 21:39:09,113] [INFO] [comm.py:652:init_distributed] cdb=None +[rank1]: Traceback (most recent call last): +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank1]: main() +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 815, in main +[rank1]: model_args, data_args, training_args = parser.parse_args_into_dataclasses() +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses +[rank1]: obj = dtype(**inputs) +[rank1]: File "", line 121, in __init__ +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/training_args.py", line 1378, in __post_init__ +[rank1]: raise ValueError( +[rank1]: ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found +[rank1]: - Evaluation strategy: no +[rank1]: - Save strategy: steps +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[rank0]:[W426 21:39:09.965942043 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[2025-04-26 21:39:09,558] [INFO] [comm.py:652:init_distributed] cdb=None +[rank2]: Traceback (most recent call last): +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank2]: main() +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 815, in main +[rank2]: model_args, data_args, training_args = parser.parse_args_into_dataclasses() +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses +[rank2]: obj = dtype(**inputs) +[rank2]: File "", line 121, in __init__ +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/training_args.py", line 1378, in __post_init__ +[rank2]: raise ValueError( +[rank2]: ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found +[rank2]: - Evaluation strategy: no +[rank2]: - Save strategy: steps +W0426 21:39:09.950084 1730999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1731050 closing signal SIGTERM +W0426 21:39:09.950550 1730999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1731051 closing signal SIGTERM +E0426 21:39:10.067810 1730999 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 1731049) of binary: /data/diji/.conda/envs/internvl/bin/python +Traceback (most recent call last): + File "/data/diji/.conda/envs/internvl/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==2.7.0', 'console_scripts', 'torchrun')()) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +internvl_chat/internvl/train/internvl_chat_finetune.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-04-26_21:39:09 + host : nlp-in-477-l.soe.ucsc.edu + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 1731049) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +W0426 21:41:12.631686 1733299 site-packages/torch/distributed/run.py:766] +W0426 21:41:12.631686 1733299 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 21:41:12.631686 1733299 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0426 21:41:12.631686 1733299 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-26 21:41:13,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 21:41:13,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 21:41:13,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 21:41:16,796] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 21:41:16,796] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +04/26/2025 21:41:16 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 21:41:16 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=internvl_chat/zero_stage1_config.json, +disable_tqdm=False, +dispatch_batches=None, +do_eval=False, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=32, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=None, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=4e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/runs/Apr26_21-41-16_nlp-in-477-l.soe.ucsc.edu, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_kwargs={}, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=3.0, +optim=adamw_torch, +optim_args=None, +output_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['wandb'], +resume_from_checkpoint=None, +run_name=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=steps, +save_total_limit=2, +seed=42, +skip_memory_metrics=True, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.01, +) +04/26/2025 21:41:16 - INFO - __main__ - Loading Tokenizer: pretrained/InternVL2_5-2B +[INFO|tokenization_utils_base.py:2025] 2025-04-26 21:41:16,944 >> loading file ./tokenizer.model +[INFO|tokenization_utils_base.py:2025] 2025-04-26 21:41:16,944 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 21:41:16,944 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 21:41:16,944 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 21:41:16,944 >> loading file tokenizer.json +[2025-04-26 21:41:16,997] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 21:41:16,997] [INFO] [comm.py:652:init_distributed] cdb=None +[WARNING|logging.py:314] 2025-04-26 21:41:17,054 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +04/26/2025 21:41:17 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 21:41:17 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 21:41:17 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:727] 2025-04-26 21:41:17,141 >> loading configuration file pretrained/InternVL2_5-2B/config.json +[INFO|configuration_utils.py:792] 2025-04-26 21:41:17,142 >> Model config InternVLChatConfig { + "_commit_hash": null, + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 2048, + "llm_config": { + "_name_or_path": "internlm/internlm2_5-1_8b-chat", + "add_cross_attention": false, + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "flash_attention_2", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForSequenceClassification": "modeling_internlm2.InternLM2ForSequenceClassification" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bias": false, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "min_length": 0, + "model_type": "internlm2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 2, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": true, + "vocab_size": 92553 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "template": "internvl2_5", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.0, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "no_repeat_ngram_size": 0, + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true + } +} + +04/26/2025 21:41:17 - INFO - __main__ - Using flash_attention_2 for InternLM +[INFO|modeling_utils.py:3473] 2025-04-26 21:41:17,143 >> loading weights file pretrained/InternVL2_5-2B/model.safetensors +[INFO|modeling_utils.py:1426] 2025-04-26 21:41:17,154 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:826] 2025-04-26 21:41:17,154 >> Generate config GenerationConfig {} + +[INFO|configuration_utils.py:826] 2025-04-26 21:41:17,182 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +[WARNING|logging.py:314] 2025-04-26 21:41:17,188 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2025-04-26 21:41:17,204 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|modeling_utils.py:4350] 2025-04-26 21:41:19,770 >> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4358] 2025-04-26 21:41:19,770 >> All the weights of InternVLChatModel were initialized from the model checkpoint at pretrained/InternVL2_5-2B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:779] 2025-04-26 21:41:19,772 >> loading configuration file pretrained/InternVL2_5-2B/generation_config.json +[INFO|configuration_utils.py:826] 2025-04-26 21:41:19,773 >> Generate config GenerationConfig { + "eos_token_id": [ + 92542, + 92543 + ] +} + +04/26/2025 21:41:19 - INFO - __main__ - Finished +04/26/2025 21:41:19 - INFO - __main__ - model.config.force_image_size: 448 +04/26/2025 21:41:19 - INFO - __main__ - data_args.force_image_size: 448 +04/26/2025 21:41:19 - INFO - __main__ - model.config.vision_config.image_size: 448 +04/26/2025 21:41:19 - INFO - __main__ - [Dataset] num_image_token: 256 +04/26/2025 21:41:19 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/26/2025 21:41:19 - INFO - __main__ - [Dataset] use_thumbnail: True +04/26/2025 21:41:19 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/26/2025 21:41:19 - INFO - __main__ - Formatting inputs...Skip in lazy mode +04/26/2025 21:41:20 - INFO - __main__ - Add dataset: bbox with length: 9967 +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.tok_embeddings.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.0.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.0.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.0.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.0.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.0.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.0.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.0.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.1.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.1.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.1.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.1.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.1.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.1.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.1.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.2.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.2.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.2.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.2.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.2.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.2.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.2.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.3.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.3.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.3.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.3.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.3.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.3.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.3.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.4.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.4.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.4.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.4.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.4.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.4.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.4.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.5.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.5.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.5.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.5.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.5.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.5.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.5.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.6.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.6.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.6.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.6.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.6.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.6.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.6.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.7.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.7.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.7.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.7.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.7.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.7.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.7.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.8.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.8.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.8.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.8.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.8.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.8.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.8.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.9.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.9.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.9.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.9.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.9.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.9.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.9.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.10.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.10.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.10.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.10.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.10.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.10.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.10.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.11.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.11.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.11.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.11.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.11.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.11.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.11.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.12.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.12.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.12.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.12.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.12.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.12.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.12.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.13.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.13.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.13.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.13.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.13.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.13.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.13.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.14.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.14.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.14.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.14.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.14.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.14.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.14.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.15.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.15.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.15.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.15.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.15.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.15.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.15.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.16.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.16.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.16.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.16.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.16.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.16.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.16.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.17.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.17.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.17.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.17.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.17.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.17.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.17.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.18.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.18.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.18.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.18.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.18.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.18.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.18.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.19.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.19.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.19.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.19.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.19.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.19.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.19.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.20.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.20.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.20.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.20.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.20.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.20.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.20.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.21.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.21.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.21.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.21.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.21.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.21.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.21.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.22.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.22.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.22.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.22.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.22.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.22.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.22.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.23.attention.wqkv.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.23.attention.wo.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.23.feed_forward.w1.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.23.feed_forward.w3.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.23.feed_forward.w2.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.23.attention_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.layers.23.ffn_norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.model.norm.weight +04/26/2025 21:41:20 - INFO - __main__ - language_model.output.weight +04/26/2025 21:41:20 - INFO - __main__ - mlp1.0.weight +04/26/2025 21:41:20 - INFO - __main__ - mlp1.0.bias +04/26/2025 21:41:20 - INFO - __main__ - mlp1.1.weight +04/26/2025 21:41:20 - INFO - __main__ - mlp1.1.bias +04/26/2025 21:41:20 - INFO - __main__ - mlp1.3.weight +04/26/2025 21:41:20 - INFO - __main__ - mlp1.3.bias +[INFO|trainer.py:571] 2025-04-26 21:41:20,010 >> Using auto half precision backend +[2025-04-26 21:41:20,199] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.15.4, git-hash=unknown, git-branch=unknown +[2025-04-26 21:41:20,199] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 3 +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +[2025-04-26 21:41:21,303] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file /data/diji/.cache/torch_extensions/py39_cu126/fused_adam/build.ninja... +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.34974217414855957 seconds +Loading extension module fused_adam... +Time to load fused_adam op: 0.40183234214782715 seconds +Loading extension module fused_adam... +Time to load fused_adam op: 0.4019894599914551 seconds +[2025-04-26 21:41:22,152] [INFO] [logging.py:128:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer +[2025-04-26 21:41:22,152] [INFO] [logging.py:128:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2025-04-26 21:41:22,158] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +[2025-04-26 21:41:22,158] [INFO] [utils.py:59:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[2025-04-26 21:41:22,158] [INFO] [logging.py:128:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 1 optimizer +[2025-04-26 21:41:22,159] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 1000000000 +[2025-04-26 21:41:22,159] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 1000000000 +[2025-04-26 21:41:22,159] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False +[2025-04-26 21:41:22,159] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False +[2025-04-26 21:41:25,790] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states +[2025-04-26 21:41:25,791] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 8.03 GB CA 8.4 GB Max_CA 8 GB +[2025-04-26 21:41:25,791] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 74.54 GB, percent = 14.8% +[2025-04-26 21:41:25,951] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states +[2025-04-26 21:41:25,952] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 9.21 GB CA 10.76 GB Max_CA 11 GB +[2025-04-26 21:41:25,952] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 74.54 GB, percent = 14.8% +[2025-04-26 21:41:25,952] [INFO] [stage_1_and_2.py:544:__init__] optimizer state initialized +[2025-04-26 21:41:26,103] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer +[2025-04-26 21:41:26,104] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 6.85 GB CA 10.76 GB Max_CA 11 GB +[2025-04-26 21:41:26,104] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 74.54 GB, percent = 14.8% +[2025-04-26 21:41:26,105] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer +[2025-04-26 21:41:26,105] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler +[2025-04-26 21:41:26,105] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2025-04-26 21:41:26,105] [INFO] [logging.py:128:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[[0.9, 0.999]] +[2025-04-26 21:41:26,106] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2025-04-26 21:41:26,106] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-04-26 21:41:26,106] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-04-26 21:41:26,106] [INFO] [config.py:1003:print] amp_enabled .................. False +[2025-04-26 21:41:26,106] [INFO] [config.py:1003:print] amp_params ................... False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] bfloat16_enabled ............. True +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] bfloat16_immediate_grad_update False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] comms_config ................. +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] communication_data_type ...... None +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] disable_allgather ............ False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] dump_state ................... False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... None +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] fp16_auto_cast ............... None +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] fp16_enabled ................. False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] global_rank .................. 0 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 32 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 1 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] loss_scale ................... 1.0 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] mics_shard_size .............. -1 +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False +[2025-04-26 21:41:26,107] [INFO] [config.py:1003:print] optimizer_name ............... adamw +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] optimizer_params ............. {'lr': 4e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.01} +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] pld_enabled .................. False +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] pld_params ................... False +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] prescale_gradients ........... False +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] scheduler_name ............... None +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] scheduler_params ............. None +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] seq_parallel_communication_data_type torch.float32 +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] sparse_attention ............. None +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] steps_per_print .............. inf +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] train_batch_size ............. 96 +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 1 +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] wall_clock_breakdown ......... True +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] weight_quantization_config ... None +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] world_size ................... 3 +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=1000000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=1000000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] zero_enabled ................. True +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2025-04-26 21:41:26,108] [INFO] [config.py:1003:print] zero_optimization_stage ...... 1 +[2025-04-26 21:41:26,108] [INFO] [config.py:989:print_user_config] json = { + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1.000000e+09, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1.000000e+09, + "contiguous_gradients": true + }, + "fp16": { + "enabled": false, + "auto_cast": true, + "loss_scale": 0, + "initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 4e-05, + "betas": [0.9, 0.999], + "eps": 1e-08, + "weight_decay": 0.01 + } + }, + "gradient_accumulation_steps": 32, + "gradient_clipping": 1.0, + "steps_per_print": inf, + "train_batch_size": 96, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": true +} +[INFO|trainer.py:1721] 2025-04-26 21:41:26,108 >> ***** Running training ***** +[INFO|trainer.py:1722] 2025-04-26 21:41:26,108 >> Num examples = 9,967 +[INFO|trainer.py:1723] 2025-04-26 21:41:26,108 >> Num Epochs = 3 +[INFO|trainer.py:1724] 2025-04-26 21:41:26,108 >> Instantaneous batch size per device = 1 +[INFO|trainer.py:1727] 2025-04-26 21:41:26,108 >> Total train batch size (w. parallel, distributed & accumulation) = 96 +[INFO|trainer.py:1728] 2025-04-26 21:41:26,108 >> Gradient Accumulation steps = 32 +[INFO|trainer.py:1729] 2025-04-26 21:41:26,108 >> Total optimization steps = 309 +[INFO|trainer.py:1730] 2025-04-26 21:41:26,109 >> Number of trainable parameters = 1,901,742,080 +[INFO|integration_utils.py:722] 2025-04-26 21:41:26,110 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" +wandb: Currently logged in as: dyang39 to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.19.10 +wandb: Run data is saved locally in /data/diji/InternVL/wandb/run-20250426_214126-j09wcelk +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run divine-snowball-133 +wandb: ⭐️ View project at https://wandb.ai/dyang39/huggingface +wandb: 🚀 View run at https://wandb.ai/dyang39/huggingface/runs/j09wcelk + 0%| | 0/309 [00:00> Saving model checkpoint to work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200 +[INFO|configuration_utils.py:473] 2025-04-26 22:39:52,058 >> Configuration saved in work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200/config.json +[INFO|configuration_utils.py:594] 2025-04-26 22:39:52,058 >> Configuration saved in work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200/generation_config.json +[INFO|modeling_utils.py:2493] 2025-04-26 22:39:57,129 >> Model weights saved in work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200/model.safetensors +[INFO|tokenization_utils_base.py:2433] 2025-04-26 22:39:57,130 >> tokenizer config file saved in work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200/tokenizer_config.json +[INFO|tokenization_utils_base.py:2442] 2025-04-26 22:39:57,131 >> Special tokens file saved in work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200/special_tokens_map.json +[INFO|tokenization_utils_base.py:2493] 2025-04-26 22:39:57,131 >> added tokens file saved in work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200/added_tokens.json +[2025-04-26 22:39:57,295] [INFO] [logging.py:128:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved! +[2025-04-26 22:39:57,327] [INFO] [logging.py:128:log_dist] [Rank 0] Saving model checkpoint: work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200/global_step200/mp_rank_00_model_states.pt +[2025-04-26 22:39:57,327] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200/global_step200/mp_rank_00_model_states.pt... +[2025-04-26 22:40:03,129] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200/global_step200/mp_rank_00_model_states.pt. +[2025-04-26 22:40:03,130] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-04-26 22:40:14,804] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-04-26 22:40:14,805] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tmp-checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-04-26 22:40:14,805] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now! +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:40:15,351] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 166.23 | bwd_microstep: 289.16 | bwd_inner_microstep: 289.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:40:15,830] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.90 | bwd_microstep: 305.92 | bwd_inner_microstep: 305.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:40:16,308] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.36 | bwd_microstep: 305.42 | bwd_inner_microstep: 305.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:40:16,436] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.95 | bwd_microstep: 85.29 | bwd_inner_microstep: 85.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:40:16,912] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 166.46 | bwd_microstep: 305.76 | bwd_inner_microstep: 305.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:40:17,391] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.28 | bwd_microstep: 305.52 | bwd_inner_microstep: 305.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:40:17,869] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.17 | bwd_microstep: 305.26 | bwd_inner_microstep: 305.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:40:18,348] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.70 | bwd_microstep: 306.21 | bwd_inner_microstep: 306.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:40:18,827] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.15 | bwd_microstep: 305.52 | bwd_inner_microstep: 305.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:19,305] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.20 | bwd_microstep: 305.73 | bwd_inner_microstep: 305.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:19,784] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.55 | bwd_microstep: 305.56 | bwd_inner_microstep: 305.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:20,263] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.52 | bwd_microstep: 305.94 | bwd_inner_microstep: 305.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:40:20,742] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 305.67 | bwd_inner_microstep: 305.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1399 +[2025-04-26 22:40:21,110] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 126.50 | bwd_microstep: 236.49 | bwd_inner_microstep: 236.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:40:21,588] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.84 | bwd_microstep: 305.99 | bwd_inner_microstep: 305.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:40:22,068] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.38 | bwd_microstep: 306.80 | bwd_inner_microstep: 306.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:40:22,547] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.95 | bwd_microstep: 305.83 | bwd_inner_microstep: 305.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:23,028] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.54 | bwd_microstep: 306.39 | bwd_inner_microstep: 306.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:23,509] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 306.17 | bwd_inner_microstep: 306.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:40:23,989] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.83 | bwd_microstep: 306.25 | bwd_inner_microstep: 306.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:24,469] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.36 | bwd_microstep: 306.92 | bwd_inner_microstep: 306.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:40:24,950] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.88 | bwd_microstep: 306.80 | bwd_inner_microstep: 306.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:40:25,431] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 306.77 | bwd_inner_microstep: 306.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:40:25,913] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 307.20 | bwd_inner_microstep: 307.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:40:26,393] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 306.15 | bwd_inner_microstep: 306.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:40:26,874] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 306.55 | bwd_inner_microstep: 306.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:40:27,354] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 306.16 | bwd_inner_microstep: 306.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 22:40:27,854] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.45 | bwd_microstep: 320.27 | bwd_inner_microstep: 320.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:40:28,338] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 308.51 | bwd_inner_microstep: 308.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 22:40:28,838] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.55 | bwd_microstep: 319.60 | bwd_inner_microstep: 319.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:40:29,321] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 307.87 | bwd_inner_microstep: 307.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:40:31,775] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.69 | optimizer_gradients: 17.50 | optimizer_step: 32.03 +[2025-04-26 22:40:31,776] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 1010.36 | bwd_inner_microstep: 621.33 | bwd_allreduce_microstep: 388.99 | step_microstep: 1269.37 +[2025-04-26 22:40:31,777] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5245.36 | bwd: 10224.06 | bwd_inner: 9834.61 | bwd_allreduce: 389.11 | step: 1270.38 + 65%|██████▌ | 201/309 [59:04<44:06, 24.50s/it] {'loss': 0.1715, 'learning_rate': 1.1552925444480674e-05, 'epoch': 1.94} + 65%|██████▌ | 201/309 [59:04<44:06, 24.50s/it]dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1405 +[2025-04-26 22:40:32,131] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 125.72 | bwd_microstep: 221.48 | bwd_inner_microstep: 221.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:40:32,614] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 307.89 | bwd_inner_microstep: 307.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:40:33,098] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 308.27 | bwd_inner_microstep: 308.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:40:33,227] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.30 | bwd_microstep: 85.73 | bwd_inner_microstep: 85.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:40:33,708] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.57 | bwd_microstep: 309.27 | bwd_inner_microstep: 309.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:40:34,192] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 308.85 | bwd_inner_microstep: 308.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:40:34,674] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 308.04 | bwd_inner_microstep: 308.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:35,159] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 308.69 | bwd_inner_microstep: 308.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:35,643] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 308.69 | bwd_inner_microstep: 308.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:40:36,127] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 308.19 | bwd_inner_microstep: 308.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:40:36,611] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 309.33 | bwd_inner_microstep: 309.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:40:37,095] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.97 | bwd_microstep: 308.59 | bwd_inner_microstep: 308.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:40:37,578] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 308.55 | bwd_inner_microstep: 308.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:40:38,062] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 308.05 | bwd_inner_microstep: 308.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 887 +[2025-04-26 22:40:38,314] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.93 | bwd_microstep: 162.09 | bwd_inner_microstep: 162.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:40:38,798] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 309.72 | bwd_inner_microstep: 309.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:39,282] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 309.45 | bwd_inner_microstep: 309.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:40:39,766] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 308.12 | bwd_inner_microstep: 308.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:40:40,251] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 308.99 | bwd_inner_microstep: 308.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:40:40,737] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 309.39 | bwd_inner_microstep: 309.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:40:41,221] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 308.77 | bwd_inner_microstep: 308.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 369 +[2025-04-26 22:40:41,351] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.70 | bwd_microstep: 85.60 | bwd_inner_microstep: 85.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:40:41,835] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.32 | bwd_microstep: 309.69 | bwd_inner_microstep: 309.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:42,320] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 309.76 | bwd_inner_microstep: 309.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 368 +[2025-04-26 22:40:42,449] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.86 | bwd_microstep: 85.20 | bwd_inner_microstep: 85.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:40:42,931] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.05 | bwd_microstep: 308.18 | bwd_inner_microstep: 308.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:40:43,415] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 308.52 | bwd_inner_microstep: 308.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1950 +[2025-04-26 22:40:43,919] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.38 | bwd_microstep: 323.18 | bwd_inner_microstep: 323.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:40:44,406] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 22:40:44,909] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.87 | bwd_microstep: 321.52 | bwd_inner_microstep: 321.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:40:45,395] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:40:48,875] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1205.22 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 22:40:48,875] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 2030.79 | bwd_inner_microstep: 337.22 | bwd_allreduce_microstep: 1693.53 | step_microstep: 1273.09 +[2025-04-26 22:40:48,877] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4945.72 | bwd: 10729.31 | bwd_inner: 9035.32 | bwd_allreduce: 1693.65 | step: 1274.20 + 65%|██████▌ | 202/309 [59:21<39:44, 22.28s/it] {'loss': 0.2646, 'learning_rate': 1.1362917801666895e-05, 'epoch': 1.95} + 65%|██████▌ | 202/309 [59:21<39:44, 22.28s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:40:49,349] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 295.51 | bwd_inner_microstep: 295.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:40:49,835] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 309.99 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:40:50,321] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 310.12 | bwd_inner_microstep: 310.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:40:50,808] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 309.83 | bwd_inner_microstep: 309.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:40:51,295] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 310.19 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:40:51,782] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:40:52,269] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.51 | bwd_microstep: 309.69 | bwd_inner_microstep: 309.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:52,755] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.37 | bwd_microstep: 309.36 | bwd_inner_microstep: 309.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:40:53,240] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 309.30 | bwd_inner_microstep: 309.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:53,727] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 309.52 | bwd_inner_microstep: 309.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:54,214] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 309.71 | bwd_inner_microstep: 309.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:40:54,345] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.92 | bwd_microstep: 86.35 | bwd_inner_microstep: 86.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 886 +[2025-04-26 22:40:54,596] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 84.86 | bwd_microstep: 162.00 | bwd_inner_microstep: 161.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:55,082] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 310.33 | bwd_inner_microstep: 310.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:40:55,570] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:40:56,056] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:40:56,543] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 309.86 | bwd_inner_microstep: 309.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:40:57,031] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 310.93 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:40:57,516] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 310.03 | bwd_inner_microstep: 310.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:40:58,002] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:40:58,132] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.92 | bwd_microstep: 85.79 | bwd_inner_microstep: 85.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:40:58,617] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 311.46 | bwd_inner_microstep: 311.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:40:58,747] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.50 | bwd_microstep: 85.83 | bwd_inner_microstep: 85.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 358 +[2025-04-26 22:40:58,874] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.90 | bwd_microstep: 85.02 | bwd_inner_microstep: 85.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:40:59,358] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 308.61 | bwd_inner_microstep: 308.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 357 +[2025-04-26 22:40:59,486] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.64 | bwd_microstep: 85.07 | bwd_inner_microstep: 85.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:40:59,972] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 309.63 | bwd_inner_microstep: 309.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 22:41:00,477] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.22 | bwd_microstep: 324.11 | bwd_inner_microstep: 324.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 22:41:00,985] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.15 | bwd_microstep: 323.41 | bwd_inner_microstep: 323.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:41:01,475] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.59 | bwd_microstep: 311.13 | bwd_inner_microstep: 311.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:41:01,964] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 311.46 | bwd_inner_microstep: 311.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:41:05,565] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.17 | optimizer_gradients: 17.56 | optimizer_step: 32.04 +[2025-04-26 22:41:05,566] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 2153.37 | bwd_inner_microstep: 339.18 | bwd_allreduce_microstep: 1814.14 | step_microstep: 1270.15 +[2025-04-26 22:41:05,567] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4754.08 | bwd: 10508.19 | bwd_inner: 8693.58 | bwd_allreduce: 1814.26 | step: 1271.37 + 66%|██████▌ | 203/309 [59:38<36:24, 20.60s/it] {'loss': 0.2298, 'learning_rate': 1.11738636582575e-05, 'epoch': 1.95} + 66%|██████▌ | 203/309 [59:38<36:24, 20.60s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:41:06,040] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 294.76 | bwd_inner_microstep: 294.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:41:06,529] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 311.56 | bwd_inner_microstep: 311.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:41:07,016] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:41:07,505] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 311.30 | bwd_inner_microstep: 311.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:41:07,995] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.90 | bwd_microstep: 311.58 | bwd_inner_microstep: 311.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:08,482] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.24 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:41:08,971] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:41:09,459] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:41:09,948] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.78 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:41:10,438] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 311.06 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:10,926] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.42 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:41:11,415] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.43 | bwd_microstep: 310.95 | bwd_inner_microstep: 310.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:41:11,547] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.22 | bwd_microstep: 86.74 | bwd_inner_microstep: 86.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:41:12,035] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 312.08 | bwd_inner_microstep: 312.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:41:12,523] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 311.50 | bwd_inner_microstep: 311.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:13,012] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.83 | bwd_microstep: 311.28 | bwd_inner_microstep: 311.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 367 +[2025-04-26 22:41:13,144] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.39 | bwd_microstep: 86.65 | bwd_inner_microstep: 86.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:41:13,631] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:14,120] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 311.75 | bwd_inner_microstep: 311.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:14,609] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.14 | bwd_microstep: 311.33 | bwd_inner_microstep: 311.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:41:15,097] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:15,587] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.14 | bwd_microstep: 311.39 | bwd_inner_microstep: 311.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:41:15,718] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.98 | bwd_microstep: 86.96 | bwd_inner_microstep: 86.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:41:16,204] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:41:16,337] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.89 | bwd_microstep: 87.53 | bwd_inner_microstep: 87.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:41:16,824] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 311.43 | bwd_inner_microstep: 311.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2202 +[2025-04-26 22:41:17,377] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.75 | bwd_microstep: 361.99 | bwd_inner_microstep: 361.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1941 +[2025-04-26 22:41:17,885] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.55 | bwd_microstep: 324.64 | bwd_inner_microstep: 324.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:41:18,374] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 313.06 | bwd_inner_microstep: 313.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 22:41:18,882] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.35 | bwd_microstep: 324.77 | bwd_inner_microstep: 324.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:41:19,016] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.71 | bwd_microstep: 89.18 | bwd_inner_microstep: 89.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:41:23,292] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1205.81 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 22:41:23,293] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 2826.24 | bwd_inner_microstep: 338.22 | bwd_allreduce_microstep: 2487.98 | step_microstep: 1273.68 +[2025-04-26 22:41:23,294] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4875.26 | bwd: 11414.59 | bwd_inner: 8926.15 | bwd_allreduce: 2488.09 | step: 1274.85 + 66%|██████▌ | 204/309 [59:55<34:32, 19.74s/it] {'loss': 0.282, 'learning_rate': 1.0985783885075407e-05, 'epoch': 1.96} + 66%|██████▌ | 204/309 [59:55<34:32, 19.74s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 22:41:23,782] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.12 | bwd_microstep: 306.18 | bwd_inner_microstep: 306.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:41:24,352] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 311.06 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:41:24,840] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 312.22 | bwd_inner_microstep: 312.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:41:25,328] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:41:25,817] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1400 +[2025-04-26 22:41:26,190] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.91 | bwd_microstep: 239.91 | bwd_inner_microstep: 239.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:41:26,677] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.18 | bwd_microstep: 310.93 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:41:27,165] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 311.28 | bwd_inner_microstep: 311.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:27,653] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:41:28,140] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:41:28,628] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.88 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:41:29,116] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1399 +[2025-04-26 22:41:29,490] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.57 | bwd_microstep: 240.50 | bwd_inner_microstep: 240.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:41:29,978] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 311.61 | bwd_inner_microstep: 311.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:41:30,109] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.47 | bwd_microstep: 86.81 | bwd_inner_microstep: 86.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:41:30,237] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.39 | bwd_microstep: 85.93 | bwd_inner_microstep: 85.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:41:30,725] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 311.63 | bwd_inner_microstep: 311.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:41:30,856] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.90 | bwd_microstep: 86.87 | bwd_inner_microstep: 86.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:31,343] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 311.69 | bwd_inner_microstep: 311.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:41:31,831] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 311.90 | bwd_inner_microstep: 311.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:41:32,319] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.60 | bwd_inner_microstep: 310.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:41:32,806] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 309.76 | bwd_inner_microstep: 309.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:41:33,295] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.64 | bwd_microstep: 310.84 | bwd_inner_microstep: 310.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:41:33,784] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:41:34,272] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 310.74 | bwd_inner_microstep: 310.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:41:34,758] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 310.01 | bwd_inner_microstep: 310.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1962 +[2025-04-26 22:41:35,270] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.56 | bwd_microstep: 327.36 | bwd_inner_microstep: 327.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 22:41:35,777] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.55 | bwd_microstep: 324.25 | bwd_inner_microstep: 324.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 387 +[2025-04-26 22:41:35,918] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.62 | bwd_microstep: 94.26 | bwd_inner_microstep: 94.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:41:36,048] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.93 | bwd_microstep: 87.97 | bwd_inner_microstep: 87.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:41:36,536] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 312.17 | bwd_inner_microstep: 312.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:41:40,340] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.44 | optimizer_gradients: 17.53 | optimizer_step: 32.05 +[2025-04-26 22:41:40,340] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 2357.05 | bwd_inner_microstep: 338.91 | bwd_allreduce_microstep: 2018.11 | step_microstep: 1270.35 +[2025-04-26 22:41:40,342] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4776.75 | bwd: 10769.07 | bwd_inner: 8750.50 | bwd_allreduce: 2018.23 | step: 1271.36 + 66%|██████▋ | 205/309 [1:00:12<32:49, 18.93s/it] {'loss': 0.3746, 'learning_rate': 1.0798699245376959e-05, 'epoch': 1.97} + 66%|██████▋ | 205/309 [1:00:12<32:49, 18.93s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:41:40,813] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 294.75 | bwd_inner_microstep: 294.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:41:41,302] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 312.85 | bwd_inner_microstep: 312.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.06 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:41:41,795] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:41:42,283] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:41:42,771] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:41:43,260] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.12 | bwd_microstep: 310.75 | bwd_inner_microstep: 310.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:41:43,749] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.73 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:41:44,239] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.35 | bwd_microstep: 311.74 | bwd_inner_microstep: 311.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:41:44,728] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 311.13 | bwd_inner_microstep: 311.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:41:45,216] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 311.38 | bwd_inner_microstep: 311.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:41:45,705] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 310.97 | bwd_inner_microstep: 310.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:41:46,192] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:41:46,682] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.34 | bwd_microstep: 310.94 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:41:47,170] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:47,659] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 311.02 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:48,147] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 310.60 | bwd_inner_microstep: 310.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:41:48,637] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 311.46 | bwd_inner_microstep: 311.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:49,126] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 311.22 | bwd_inner_microstep: 311.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:49,615] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.95 | bwd_microstep: 311.44 | bwd_inner_microstep: 311.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:50,105] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 311.72 | bwd_inner_microstep: 311.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:41:50,593] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 311.47 | bwd_inner_microstep: 311.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:41:51,082] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 310.91 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:41:51,570] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.27 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 368 +[2025-04-26 22:41:51,700] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.08 | bwd_microstep: 86.06 | bwd_inner_microstep: 86.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:41:52,187] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 311.55 | bwd_inner_microstep: 311.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:41:52,676] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:41:53,163] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1952 +[2025-04-26 22:41:53,673] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.44 | bwd_microstep: 325.91 | bwd_inner_microstep: 325.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 916 +[2025-04-26 22:41:53,941] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 91.31 | bwd_microstep: 172.65 | bwd_inner_microstep: 172.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:41:54,429] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 313.51 | bwd_inner_microstep: 313.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:41:54,918] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 311.82 | bwd_inner_microstep: 311.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:41:57,381] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1205.34 | optimizer_gradients: 17.53 | optimizer_step: 32.04 +[2025-04-26 22:41:57,382] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 1013.39 | bwd_inner_microstep: 624.05 | bwd_allreduce_microstep: 389.30 | step_microstep: 1273.19 +[2025-04-26 22:41:57,383] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5304.29 | bwd: 10295.84 | bwd_inner: 9906.09 | bwd_allreduce: 389.41 | step: 1274.34 + 67%|██████▋ | 206/309 [1:00:30<31:31, 18.37s/it] {'loss': 0.318, 'learning_rate': 1.0612630392559728e-05, 'epoch': 1.98} + 67%|██████▋ | 206/309 [1:00:30<31:31, 18.37s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:41:57,856] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 295.23 | bwd_inner_microstep: 295.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:41:58,345] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 312.50 | bwd_inner_microstep: 312.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:41:58,834] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 311.54 | bwd_inner_microstep: 311.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:41:59,321] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:41:59,810] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 311.09 | bwd_inner_microstep: 311.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:42:00,299] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 311.55 | bwd_inner_microstep: 311.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:42:00,788] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 311.31 | bwd_inner_microstep: 311.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:42:01,277] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 311.32 | bwd_inner_microstep: 311.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:42:01,408] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.11 | bwd_microstep: 86.80 | bwd_inner_microstep: 86.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:42:01,896] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 312.14 | bwd_inner_microstep: 312.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:42:02,027] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.46 | bwd_microstep: 86.87 | bwd_inner_microstep: 86.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:42:02,514] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 311.25 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:42:03,003] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 312.67 | bwd_inner_microstep: 312.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:42:03,492] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:42:03,979] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:42:04,469] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.24 | bwd_microstep: 311.84 | bwd_inner_microstep: 311.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:42:04,958] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 310.93 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:42:05,448] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 311.91 | bwd_inner_microstep: 311.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:42:05,939] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 311.28 | bwd_inner_microstep: 311.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:42:06,428] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 311.75 | bwd_inner_microstep: 311.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:42:06,918] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.83 | bwd_microstep: 311.51 | bwd_inner_microstep: 311.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.08 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:42:07,406] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.56 | bwd_inner_microstep: 310.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:42:07,537] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.12 | bwd_microstep: 86.98 | bwd_inner_microstep: 86.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:42:08,024] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:42:08,511] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:42:08,999] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1974 +[2025-04-26 22:42:09,510] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.86 | bwd_microstep: 327.73 | bwd_inner_microstep: 327.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1940 +[2025-04-26 22:42:10,019] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.69 | bwd_microstep: 324.98 | bwd_inner_microstep: 324.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:42:10,509] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.15 | bwd_microstep: 312.13 | bwd_inner_microstep: 312.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:42:10,998] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 311.89 | bwd_inner_microstep: 311.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 22:42:11,504] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.02 | bwd_microstep: 323.35 | bwd_inner_microstep: 323.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:42:14,851] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.76 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 22:42:14,851] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 1900.17 | bwd_inner_microstep: 339.40 | bwd_allreduce_microstep: 1560.73 | step_microstep: 1269.69 +[2025-04-26 22:42:14,853] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5134.45 | bwd: 10905.38 | bwd_inner: 9344.18 | bwd_allreduce: 1560.86 | step: 1270.89 + 67%|██████▋ | 207/309 [1:00:47<30:45, 18.10s/it] {'loss': 0.2953, 'learning_rate': 1.0427597867882474e-05, 'epoch': 1.99} + 67%|██████▋ | 207/309 [1:00:47<30:45, 18.10s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:42:15,326] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 296.24 | bwd_inner_microstep: 296.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:42:15,815] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 311.29 | bwd_inner_microstep: 311.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:42:16,302] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:42:16,792] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.64 | bwd_microstep: 311.55 | bwd_inner_microstep: 311.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:42:17,280] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 310.58 | bwd_inner_microstep: 310.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 22:42:17,411] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.03 | bwd_microstep: 86.42 | bwd_inner_microstep: 86.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 22:42:17,540] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.02 | bwd_microstep: 87.16 | bwd_inner_microstep: 87.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:42:18,028] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 312.07 | bwd_inner_microstep: 312.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:42:18,516] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 311.23 | bwd_inner_microstep: 311.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:42:19,004] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:42:19,492] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 310.05 | bwd_inner_microstep: 310.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:42:19,980] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:42:20,469] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.15 | bwd_microstep: 310.91 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:42:20,958] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 311.27 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:42:21,448] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.11 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:42:21,936] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 311.18 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:42:22,424] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:42:22,912] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:42:23,041] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.15 | bwd_microstep: 85.70 | bwd_inner_microstep: 85.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:42:23,528] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 311.03 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:42:24,015] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 311.13 | bwd_inner_microstep: 311.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1974 +[2025-04-26 22:42:25,801] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.65 | bwd_microstep: 326.86 | bwd_inner_microstep: 326.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +[2025-04-26 22:42:27,819] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 22:42:28,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 22:42:28,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 22:42:33,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 22:42:33,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 22:42:33,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 22:42:38,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 22:42:38,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 22:42:38,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 22:42:42,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 22:42:42,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 22:42:42,529] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:42:46,049] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 305.98 | bwd_inner_microstep: 305.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:42:46,528] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.12 | bwd_microstep: 306.14 | bwd_inner_microstep: 306.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 886 +[2025-04-26 22:42:46,777] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 84.28 | bwd_microstep: 159.75 | bwd_inner_microstep: 159.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:42:47,255] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.05 | bwd_microstep: 306.62 | bwd_inner_microstep: 306.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 22:42:47,752] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.92 | bwd_microstep: 317.52 | bwd_inner_microstep: 317.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:42:48,232] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.54 | bwd_microstep: 306.25 | bwd_inner_microstep: 306.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:42:48,711] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 305.79 | bwd_inner_microstep: 305.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:42:49,191] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.99 | bwd_microstep: 306.32 | bwd_inner_microstep: 306.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:42:49,672] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 306.57 | bwd_inner_microstep: 306.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 22:42:52,529] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.32 | optimizer_gradients: 17.51 | optimizer_step: 32.03 +[2025-04-26 22:42:52,530] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 126.06 | bwd_microstep: 1454.51 | bwd_inner_microstep: 264.03 | bwd_allreduce_microstep: 1190.44 | step_microstep: 1272.08 +[2025-04-26 22:42:52,531] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4960.37 | bwd: 10244.86 | bwd_inner: 9053.95 | bwd_allreduce: 1190.56 | step: 1273.19 + 67%|██████▋ | 208/309 [1:01:25<40:21, 23.97s/it] {'loss': 0.2542, 'learning_rate': 1.0243622098197456e-05, 'epoch': 2.0} + 67%|██████▋ | 208/309 [1:01:25<40:21, 23.97s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:42:52,995] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.38 | bwd_microstep: 289.55 | bwd_inner_microstep: 289.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:42:53,475] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.60 | bwd_microstep: 307.18 | bwd_inner_microstep: 307.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:42:53,955] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.94 | bwd_microstep: 306.14 | bwd_inner_microstep: 306.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:42:54,434] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.90 | bwd_microstep: 306.03 | bwd_inner_microstep: 306.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:42:54,915] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 306.17 | bwd_inner_microstep: 306.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:42:55,396] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.17 | bwd_microstep: 306.42 | bwd_inner_microstep: 306.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:42:55,877] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 307.03 | bwd_inner_microstep: 307.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:42:56,358] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.05 | bwd_microstep: 307.28 | bwd_inner_microstep: 307.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:42:56,840] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 307.34 | bwd_inner_microstep: 307.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:42:57,321] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 306.99 | bwd_inner_microstep: 306.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:42:57,802] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 306.54 | bwd_inner_microstep: 306.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:42:58,283] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 306.85 | bwd_inner_microstep: 306.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:42:58,766] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 307.68 | bwd_inner_microstep: 307.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:42:59,249] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 307.37 | bwd_inner_microstep: 307.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:42:59,731] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 307.71 | bwd_inner_microstep: 307.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:00,215] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 307.88 | bwd_inner_microstep: 307.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:00,697] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 307.24 | bwd_inner_microstep: 307.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 879 +[2025-04-26 22:43:00,947] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 84.96 | bwd_microstep: 160.97 | bwd_inner_microstep: 160.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:43:01,430] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 308.49 | bwd_inner_microstep: 308.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:43:01,913] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 307.59 | bwd_inner_microstep: 307.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:43:02,395] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 307.39 | bwd_inner_microstep: 307.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:43:02,876] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 306.42 | bwd_inner_microstep: 306.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1964 +[2025-04-26 22:43:03,380] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.10 | bwd_microstep: 323.88 | bwd_inner_microstep: 323.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1930 +[2025-04-26 22:43:03,882] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.75 | bwd_microstep: 320.19 | bwd_inner_microstep: 320.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 22:43:04,382] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.00 | bwd_microstep: 319.65 | bwd_inner_microstep: 319.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:43:04,868] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 308.47 | bwd_inner_microstep: 308.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:43:05,351] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 308.49 | bwd_inner_microstep: 308.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:43:05,836] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 308.90 | bwd_inner_microstep: 308.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:43:06,321] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 309.40 | bwd_inner_microstep: 309.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:43:06,805] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 309.44 | bwd_inner_microstep: 309.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:43:07,290] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 308.92 | bwd_inner_microstep: 308.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:10,095] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.15 | optimizer_gradients: 17.50 | optimizer_step: 32.03 +[2025-04-26 22:43:10,096] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 1360.43 | bwd_inner_microstep: 337.08 | bwd_allreduce_microstep: 1023.26 | step_microstep: 1269.94 +[2025-04-26 22:43:10,097] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5371.69 | bwd: 10770.06 | bwd_inner: 9746.29 | bwd_allreduce: 1023.40 | step: 1271.01 + 68%|██████▊ | 209/309 [1:01:42<36:44, 22.05s/it] {'loss': 0.2674, 'learning_rate': 1.0060723393695411e-05, 'epoch': 2.01} + 68%|██████▊ | 209/309 [1:01:42<36:44, 22.05s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:43:10,564] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.49 | bwd_microstep: 292.31 | bwd_inner_microstep: 292.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:43:11,050] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:43:11,534] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 308.34 | bwd_inner_microstep: 308.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:43:12,019] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 309.13 | bwd_inner_microstep: 309.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:43:12,505] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 308.97 | bwd_inner_microstep: 308.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:43:12,990] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.26 | bwd_microstep: 309.10 | bwd_inner_microstep: 309.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:43:13,475] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 309.51 | bwd_inner_microstep: 309.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:13,959] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 308.76 | bwd_inner_microstep: 308.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:43:14,443] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 308.53 | bwd_inner_microstep: 308.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 369 +[2025-04-26 22:43:14,573] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.88 | bwd_microstep: 86.11 | bwd_inner_microstep: 86.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:43:15,057] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.02 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:43:15,541] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 309.67 | bwd_inner_microstep: 309.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:43:16,026] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 308.52 | bwd_inner_microstep: 308.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:16,512] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 308.85 | bwd_inner_microstep: 308.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:16,998] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 309.83 | bwd_inner_microstep: 309.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 368 +[2025-04-26 22:43:17,127] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.17 | bwd_microstep: 85.61 | bwd_inner_microstep: 85.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 878 +[2025-04-26 22:43:17,376] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 83.70 | bwd_microstep: 161.51 | bwd_inner_microstep: 161.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:43:17,860] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 308.39 | bwd_inner_microstep: 308.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:43:18,344] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 308.95 | bwd_inner_microstep: 308.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:43:18,829] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 308.59 | bwd_inner_microstep: 308.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:43:19,313] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 308.11 | bwd_inner_microstep: 308.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:43:19,797] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 307.68 | bwd_inner_microstep: 307.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1953 +[2025-04-26 22:43:20,305] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.29 | bwd_microstep: 324.97 | bwd_inner_microstep: 324.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:43:20,809] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.80 | bwd_microstep: 322.06 | bwd_inner_microstep: 322.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 22:43:21,313] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.32 | bwd_microstep: 322.23 | bwd_inner_microstep: 322.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:43:21,800] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:43:22,287] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 22:43:22,417] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.88 | bwd_microstep: 86.81 | bwd_inner_microstep: 86.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:43:22,903] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 311.44 | bwd_inner_microstep: 311.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:43:23,391] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:43:23,878] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:43:27,093] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.65 | optimizer_gradients: 17.54 | optimizer_step: 32.03 +[2025-04-26 22:43:27,094] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.26 | bwd_microstep: 1766.09 | bwd_inner_microstep: 337.94 | bwd_allreduce_microstep: 1428.11 | step_microstep: 1272.49 +[2025-04-26 22:43:27,095] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5015.19 | bwd: 10562.14 | bwd_inner: 9133.56 | bwd_allreduce: 1428.23 | step: 1273.50 + 68%|██████▊ | 210/309 [1:01:59<33:52, 20.53s/it] {'loss': 0.2679, 'learning_rate': 9.878921945663368e-06, 'epoch': 2.02} + 68%|██████▊ | 210/309 [1:01:59<33:52, 20.53s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:43:27,566] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 294.59 | bwd_inner_microstep: 294.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:43:28,054] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:43:28,542] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:43:29,028] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 309.81 | bwd_inner_microstep: 309.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:43:29,517] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.06 | bwd_microstep: 310.88 | bwd_inner_microstep: 310.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:30,004] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:43:30,493] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 310.75 | bwd_inner_microstep: 310.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:43:30,980] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:43:31,469] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 311.11 | bwd_inner_microstep: 311.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1399 +[2025-04-26 22:43:31,843] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.55 | bwd_microstep: 239.91 | bwd_inner_microstep: 239.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:32,329] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:43:32,818] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.21 | bwd_microstep: 310.96 | bwd_inner_microstep: 310.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:43:33,306] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.01 | bwd_inner_microstep: 310.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:43:33,795] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 311.52 | bwd_inner_microstep: 311.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:43:34,283] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 310.53 | bwd_inner_microstep: 310.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:43:34,771] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:43:35,259] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.36 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:43:35,747] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 310.74 | bwd_inner_microstep: 310.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:43:36,236] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:43:36,724] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:43:37,213] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:43:37,700] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 310.13 | bwd_inner_microstep: 310.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 22:43:38,208] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.92 | bwd_microstep: 324.44 | bwd_inner_microstep: 324.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1923 +[2025-04-26 22:43:38,715] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.53 | bwd_microstep: 324.04 | bwd_inner_microstep: 324.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:43:39,222] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.83 | bwd_microstep: 323.59 | bwd_inner_microstep: 323.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 22:43:39,356] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.34 | bwd_microstep: 88.77 | bwd_inner_microstep: 88.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:43:39,844] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 311.73 | bwd_inner_microstep: 311.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:43:40,334] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 313.64 | bwd_inner_microstep: 313.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:43:40,824] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.94 | bwd_microstep: 311.28 | bwd_inner_microstep: 311.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:43:41,313] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 312.26 | bwd_inner_microstep: 312.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:43:41,805] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.62 | bwd_microstep: 312.50 | bwd_inner_microstep: 312.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:43:44,267] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.86 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 22:43:44,267] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 1014.90 | bwd_inner_microstep: 625.31 | bwd_allreduce_microstep: 389.55 | step_microstep: 1269.65 +[2025-04-26 22:43:44,269] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5357.57 | bwd: 10383.83 | bwd_inner: 9993.82 | bwd_allreduce: 389.67 | step: 1270.74 + 68%|██████▊ | 211/309 [1:02:16<31:53, 19.53s/it] {'loss': 0.2404, 'learning_rate': 9.698237824255634e-06, 'epoch': 2.03} + 68%|██████▊ | 211/309 [1:02:16<31:53, 19.53s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:43:44,741] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 295.74 | bwd_inner_microstep: 295.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:45,230] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 311.83 | bwd_inner_microstep: 311.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:43:45,720] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 311.65 | bwd_inner_microstep: 311.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:46,207] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:43:46,697] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.93 | bwd_microstep: 311.96 | bwd_inner_microstep: 311.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:47,186] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.90 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:43:47,319] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.51 | bwd_microstep: 87.95 | bwd_inner_microstep: 87.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:43:47,807] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.19 | bwd_microstep: 311.32 | bwd_inner_microstep: 311.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:43:48,295] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 311.39 | bwd_inner_microstep: 311.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 367 +[2025-04-26 22:43:48,426] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.35 | bwd_microstep: 86.86 | bwd_inner_microstep: 86.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:43:48,914] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 311.42 | bwd_inner_microstep: 311.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:43:49,402] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 311.17 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:49,893] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 312.58 | bwd_inner_microstep: 312.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:43:50,382] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.33 | bwd_microstep: 311.05 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:43:50,870] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.50 | bwd_microstep: 311.26 | bwd_inner_microstep: 311.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:43:51,361] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.35 | bwd_microstep: 312.16 | bwd_inner_microstep: 312.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 879 +[2025-04-26 22:43:51,615] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.79 | bwd_microstep: 162.31 | bwd_inner_microstep: 162.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:43:52,102] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 310.58 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:43:52,592] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 312.66 | bwd_inner_microstep: 312.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 358 +[2025-04-26 22:43:52,724] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.86 | bwd_microstep: 86.81 | bwd_inner_microstep: 86.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:43:53,212] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 311.23 | bwd_inner_microstep: 311.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1986 +[2025-04-26 22:43:53,727] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.15 | bwd_microstep: 329.75 | bwd_inner_microstep: 329.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 22:43:54,236] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.61 | bwd_microstep: 325.01 | bwd_inner_microstep: 325.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:43:54,727] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.98 | bwd_microstep: 311.81 | bwd_inner_microstep: 311.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1410 +[2025-04-26 22:43:55,108] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 131.10 | bwd_microstep: 245.13 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:43:55,602] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.07 | bwd_microstep: 312.59 | bwd_inner_microstep: 312.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:43:56,094] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 312.79 | bwd_inner_microstep: 312.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:43:56,583] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.00 | bwd_microstep: 311.43 | bwd_inner_microstep: 311.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.06 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:43:57,075] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.85 | bwd_microstep: 312.20 | bwd_inner_microstep: 312.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:43:57,566] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.27 | bwd_microstep: 312.84 | bwd_inner_microstep: 312.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:43:58,057] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.13 | bwd_microstep: 311.83 | bwd_inner_microstep: 311.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:44:01,795] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1205.70 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 22:44:01,795] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 2287.62 | bwd_inner_microstep: 340.55 | bwd_allreduce_microstep: 1947.04 | step_microstep: 1273.60 +[2025-04-26 22:44:01,797] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5009.69 | bwd: 11076.74 | bwd_inner: 9129.24 | bwd_allreduce: 1947.15 | step: 1274.82 + 69%|██████▊ | 212/309 [1:02:34<30:35, 18.93s/it] {'loss': 0.227, 'learning_rate': 9.518690976278108e-06, 'epoch': 2.04} + 69%|██████▊ | 212/309 [1:02:34<30:35, 18.93s/it]dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:44:01,911] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.67 | bwd_microstep: 70.39 | bwd_inner_microstep: 70.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:44:02,397] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 310.01 | bwd_inner_microstep: 310.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:44:02,886] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 313.19 | bwd_inner_microstep: 313.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:44:03,374] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 310.98 | bwd_inner_microstep: 310.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:44:03,862] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:44:04,351] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.08 | bwd_microstep: 311.19 | bwd_inner_microstep: 311.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:44:04,839] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 311.24 | bwd_inner_microstep: 311.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:44:05,328] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 311.37 | bwd_inner_microstep: 311.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:44:05,817] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.46 | bwd_microstep: 311.21 | bwd_inner_microstep: 311.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:44:05,948] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.06 | bwd_microstep: 87.09 | bwd_inner_microstep: 87.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:44:06,435] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:06,923] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 311.52 | bwd_inner_microstep: 311.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 22:44:07,055] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.60 | bwd_microstep: 87.03 | bwd_inner_microstep: 87.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:44:07,184] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.37 | bwd_microstep: 86.70 | bwd_inner_microstep: 86.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:44:07,670] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:44:08,158] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 311.35 | bwd_inner_microstep: 311.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:44:08,646] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 310.26 | bwd_inner_microstep: 310.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:44:09,136] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 310.88 | bwd_inner_microstep: 310.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:44:09,625] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.88 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:44:10,113] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 310.69 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:44:10,602] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:44:11,089] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.11 | bwd_microstep: 309.31 | bwd_inner_microstep: 309.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1950 +[2025-04-26 22:44:11,597] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.64 | bwd_microstep: 324.98 | bwd_inner_microstep: 324.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1930 +[2025-04-26 22:44:12,104] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.78 | bwd_microstep: 323.56 | bwd_inner_microstep: 323.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:44:12,594] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.27 | bwd_microstep: 311.89 | bwd_inner_microstep: 311.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:44:13,083] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.16 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:44:13,214] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.54 | bwd_microstep: 87.02 | bwd_inner_microstep: 87.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:44:13,702] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 312.56 | bwd_inner_microstep: 312.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 382 +[2025-04-26 22:44:13,835] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.94 | bwd_microstep: 87.37 | bwd_inner_microstep: 87.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1399 +[2025-04-26 22:44:14,207] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 127.16 | bwd_microstep: 240.29 | bwd_inner_microstep: 240.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:44:14,697] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 312.35 | bwd_inner_microstep: 312.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:44:18,068] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.60 | optimizer_gradients: 17.53 | optimizer_step: 32.06 +[2025-04-26 22:44:18,068] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 1923.47 | bwd_inner_microstep: 339.16 | bwd_allreduce_microstep: 1584.27 | step_microstep: 1271.34 +[2025-04-26 22:44:18,070] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4686.95 | bwd: 10163.17 | bwd_inner: 8578.43 | bwd_allreduce: 1584.40 | step: 1272.45 + 69%|██████▉ | 213/309 [1:02:50<29:00, 18.13s/it] {'loss': 0.1782, 'learning_rate': 9.340301222986232e-06, 'epoch': 2.05} + 69%|██████▉ | 213/309 [1:02:50<29:00, 18.13s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:44:18,541] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 294.58 | bwd_inner_microstep: 294.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:44:19,030] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 312.01 | bwd_inner_microstep: 311.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:44:19,517] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:20,005] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 22:44:20,137] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.24 | bwd_microstep: 87.34 | bwd_inner_microstep: 87.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:44:20,625] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 311.97 | bwd_inner_microstep: 311.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:44:20,757] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.28 | bwd_microstep: 86.84 | bwd_inner_microstep: 86.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:44:21,243] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 311.09 | bwd_inner_microstep: 311.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:44:21,732] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 312.38 | bwd_inner_microstep: 312.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:22,220] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 310.38 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:44:22,707] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:44:22,838] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.07 | bwd_microstep: 86.79 | bwd_inner_microstep: 86.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:44:23,325] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:44:23,815] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.03 | bwd_microstep: 311.29 | bwd_inner_microstep: 311.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:24,304] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 311.67 | bwd_inner_microstep: 311.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:44:24,790] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 309.73 | bwd_inner_microstep: 309.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:44:25,278] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:44:25,767] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.30 | bwd_microstep: 310.16 | bwd_inner_microstep: 310.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:44:26,254] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 358 +[2025-04-26 22:44:26,382] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.05 | bwd_microstep: 85.17 | bwd_inner_microstep: 85.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1380 +[2025-04-26 22:44:26,753] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 127.52 | bwd_microstep: 238.68 | bwd_inner_microstep: 238.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 406 +[2025-04-26 22:44:26,895] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.24 | bwd_microstep: 95.56 | bwd_inner_microstep: 95.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1962 +[2025-04-26 22:44:27,403] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.93 | bwd_microstep: 326.86 | bwd_inner_microstep: 326.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:44:27,893] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 312.70 | bwd_inner_microstep: 312.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:44:28,024] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.28 | bwd_microstep: 87.01 | bwd_inner_microstep: 87.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:44:28,513] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 312.35 | bwd_inner_microstep: 312.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.06 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:44:29,003] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 312.25 | bwd_inner_microstep: 312.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:44:29,493] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.11 | bwd_microstep: 311.36 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:44:29,981] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 311.18 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:44:30,470] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.29 | bwd_microstep: 311.15 | bwd_inner_microstep: 311.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 378 +[2025-04-26 22:44:30,602] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.40 | bwd_microstep: 86.97 | bwd_inner_microstep: 86.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:35,259] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.63 | optimizer_gradients: 17.53 | optimizer_step: 32.04 +[2025-04-26 22:44:35,259] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 3209.31 | bwd_inner_microstep: 338.87 | bwd_allreduce_microstep: 2870.39 | step_microstep: 1271.42 +[2025-04-26 22:44:35,261] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4548.15 | bwd: 11221.48 | bwd_inner: 8350.62 | bwd_allreduce: 2870.52 | step: 1272.61 + 69%|██████▉ | 214/309 [1:03:07<28:15, 17.85s/it] {'loss': 0.2436, 'learning_rate': 9.163088257896825e-06, 'epoch': 2.06} + 69%|██████▉ | 214/309 [1:03:07<28:15, 17.85s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:44:35,731] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 293.81 | bwd_inner_microstep: 293.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:44:36,218] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:44:36,704] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 309.80 | bwd_inner_microstep: 309.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:44:37,191] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:44:37,679] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.31 | bwd_microstep: 310.61 | bwd_inner_microstep: 310.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:38,166] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:44:38,296] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.82 | bwd_microstep: 86.23 | bwd_inner_microstep: 86.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:44:38,782] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 311.26 | bwd_inner_microstep: 311.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:39,270] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 310.91 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:44:39,755] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 309.59 | bwd_inner_microstep: 309.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:44:40,243] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 310.42 | bwd_inner_microstep: 310.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:40,731] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:44:40,862] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.07 | bwd_microstep: 86.61 | bwd_inner_microstep: 86.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:41,348] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.26 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:44:41,835] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 880 +[2025-04-26 22:44:42,088] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.13 | bwd_microstep: 162.34 | bwd_inner_microstep: 162.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:44:42,216] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.01 | bwd_microstep: 85.60 | bwd_inner_microstep: 85.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:44:42,701] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 309.81 | bwd_inner_microstep: 309.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:44:43,189] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:44:43,675] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 309.65 | bwd_inner_microstep: 309.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:44:44,161] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 309.52 | bwd_inner_microstep: 309.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1978 +[2025-04-26 22:44:44,673] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.94 | bwd_microstep: 328.08 | bwd_inner_microstep: 328.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1974 +[2025-04-26 22:44:45,184] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.97 | bwd_microstep: 326.82 | bwd_inner_microstep: 326.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 22:44:45,688] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.16 | bwd_microstep: 322.53 | bwd_inner_microstep: 322.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:44:46,194] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.07 | bwd_microstep: 322.67 | bwd_inner_microstep: 322.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:44:46,684] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.79 | bwd_microstep: 311.85 | bwd_inner_microstep: 311.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1405 +[2025-04-26 22:44:47,058] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.03 | bwd_microstep: 240.13 | bwd_inner_microstep: 240.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:44:47,545] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 311.27 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:44:48,034] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 311.71 | bwd_inner_microstep: 311.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:44:48,523] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.11 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 891 +[2025-04-26 22:44:48,776] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.62 | bwd_microstep: 163.16 | bwd_inner_microstep: 163.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:44:51,572] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.61 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 22:44:51,573] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.63 | bwd_microstep: 1350.36 | bwd_inner_microstep: 339.41 | bwd_allreduce_microstep: 1010.91 | step_microstep: 1270.83 +[2025-04-26 22:44:51,574] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4914.70 | bwd: 9979.94 | bwd_inner: 8968.57 | bwd_allreduce: 1011.03 | step: 1271.87 + 70%|██████▉ | 215/309 [1:03:24<27:14, 17.39s/it] {'loss': 0.253, 'learning_rate': 8.987071644613985e-06, 'epoch': 2.07} + 70%|██████▉ | 215/309 [1:03:24<27:14, 17.39s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:44:52,045] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 294.23 | bwd_inner_microstep: 294.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:52,534] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.61 | bwd_microstep: 312.30 | bwd_inner_microstep: 312.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:44:53,020] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 310.13 | bwd_inner_microstep: 310.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:44:53,507] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 310.31 | bwd_inner_microstep: 310.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:44:53,995] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 310.61 | bwd_inner_microstep: 310.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:44:54,482] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 309.99 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:54,969] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 310.19 | bwd_inner_microstep: 310.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:44:55,457] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.76 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:44:55,945] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 311.28 | bwd_inner_microstep: 311.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:56,432] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 310.01 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:56,919] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 310.40 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:57,407] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:44:57,895] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 369 +[2025-04-26 22:44:58,025] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.80 | bwd_microstep: 86.08 | bwd_inner_microstep: 86.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:44:58,511] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:44:58,998] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:44:59,486] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 22:44:59,616] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.31 | bwd_microstep: 86.26 | bwd_inner_microstep: 86.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:45:00,102] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:45:00,590] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 310.84 | bwd_inner_microstep: 310.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:45:01,077] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 309.30 | bwd_inner_microstep: 309.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1940 +[2025-04-26 22:45:01,584] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.27 | bwd_microstep: 324.38 | bwd_inner_microstep: 324.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1950 +[2025-04-26 22:45:02,095] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.77 | bwd_microstep: 325.71 | bwd_inner_microstep: 325.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1924 +[2025-04-26 22:45:02,600] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.65 | bwd_microstep: 322.50 | bwd_inner_microstep: 322.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:45:03,090] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 311.89 | bwd_inner_microstep: 311.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:45:03,581] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.31 | bwd_microstep: 311.03 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:45:04,070] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 311.51 | bwd_inner_microstep: 311.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:45:04,558] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.59 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:45:05,047] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.76 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:45:05,537] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.93 | bwd_microstep: 311.45 | bwd_inner_microstep: 311.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:45:06,026] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:45:08,623] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.07 | optimizer_gradients: 17.52 | optimizer_step: 32.02 +[2025-04-26 22:45:08,623] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.11 | bwd_microstep: 1148.30 | bwd_inner_microstep: 340.40 | bwd_allreduce_microstep: 807.86 | step_microstep: 1271.69 +[2025-04-26 22:45:08,625] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5266.85 | bwd: 10355.99 | bwd_inner: 9547.67 | bwd_allreduce: 807.97 | step: 1272.78 + 70%|██████▉ | 216/309 [1:03:41<26:47, 17.29s/it] {'loss': 0.2591, 'learning_rate': 8.812270814669338e-06, 'epoch': 2.08} + 70%|██████▉ | 216/309 [1:03:41<26:47, 17.29s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:45:09,096] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 294.37 | bwd_inner_microstep: 294.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:09,583] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 311.24 | bwd_inner_microstep: 311.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:45:10,073] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.36 | bwd_microstep: 311.71 | bwd_inner_microstep: 311.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:10,559] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 309.64 | bwd_inner_microstep: 309.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:45:11,047] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 311.25 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:45:11,537] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.87 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:45:12,023] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:45:12,510] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 309.82 | bwd_inner_microstep: 309.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:12,998] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:45:13,486] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:45:13,616] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.94 | bwd_microstep: 86.47 | bwd_inner_microstep: 86.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:45:14,102] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 311.33 | bwd_inner_microstep: 311.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:45:14,233] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.57 | bwd_microstep: 86.61 | bwd_inner_microstep: 86.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:14,719] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 310.97 | bwd_inner_microstep: 310.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:15,207] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 311.79 | bwd_inner_microstep: 311.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 368 +[2025-04-26 22:45:15,337] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.10 | bwd_microstep: 86.09 | bwd_inner_microstep: 86.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:45:15,822] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 310.33 | bwd_inner_microstep: 310.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:45:16,310] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 311.36 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:45:16,797] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:45:17,284] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:45:17,771] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.81 | bwd_microstep: 310.21 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2029 +[2025-04-26 22:45:18,290] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.64 | bwd_microstep: 333.78 | bwd_inner_microstep: 333.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 426 +[2025-04-26 22:45:18,435] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 43.09 | bwd_microstep: 97.19 | bwd_inner_microstep: 97.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 22:45:18,939] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.47 | bwd_microstep: 323.14 | bwd_inner_microstep: 323.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:45:19,427] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 312.21 | bwd_inner_microstep: 312.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 22:45:19,933] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.68 | bwd_microstep: 322.61 | bwd_inner_microstep: 322.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:45:20,422] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 311.31 | bwd_inner_microstep: 311.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:45:20,911] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.48 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:45:21,400] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 311.74 | bwd_inner_microstep: 311.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:45:21,889] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.29 | bwd_microstep: 311.15 | bwd_inner_microstep: 311.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:45:22,376] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:45:26,683] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.44 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 22:45:26,684] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.16 | bwd_microstep: 2859.08 | bwd_inner_microstep: 338.59 | bwd_allreduce_microstep: 2520.44 | step_microstep: 1270.15 +[2025-04-26 22:45:26,685] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5001.81 | bwd: 11639.56 | bwd_inner: 9118.66 | bwd_allreduce: 2520.57 | step: 1271.15 + 70%|███████ | 217/309 [1:03:59<26:51, 17.52s/it] {'loss': 0.2712, 'learning_rate': 8.638705065376887e-06, 'epoch': 2.09} + 70%|███████ | 217/309 [1:03:59<26:51, 17.52s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:27,156] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 294.18 | bwd_inner_microstep: 294.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:27,644] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 890 +[2025-04-26 22:45:27,897] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.78 | bwd_microstep: 162.66 | bwd_inner_microstep: 162.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:45:28,382] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 310.61 | bwd_inner_microstep: 310.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:28,869] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.53 | bwd_microstep: 310.88 | bwd_inner_microstep: 310.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:45:29,357] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 310.49 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:29,843] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 310.21 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:30,331] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:30,818] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:45:31,304] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 309.70 | bwd_inner_microstep: 309.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 22:45:31,678] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.57 | bwd_microstep: 240.20 | bwd_inner_microstep: 240.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:45:32,164] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 309.84 | bwd_inner_microstep: 309.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:45:32,650] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:33,138] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:45:33,626] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:45:34,111] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 309.46 | bwd_inner_microstep: 309.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1895 +[2025-04-26 22:45:34,598] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 309.29 | bwd_inner_microstep: 309.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1895 +[2025-04-26 22:45:35,084] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 309.16 | bwd_inner_microstep: 309.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:45:35,571] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 309.53 | bwd_inner_microstep: 309.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:45:36,056] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 309.09 | bwd_inner_microstep: 309.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:45:36,543] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 309.88 | bwd_inner_microstep: 309.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1950 +[2025-04-26 22:45:37,051] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.56 | bwd_microstep: 324.86 | bwd_inner_microstep: 324.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 22:45:37,557] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.40 | bwd_microstep: 323.79 | bwd_inner_microstep: 323.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:45:38,047] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.57 | bwd_microstep: 311.47 | bwd_inner_microstep: 311.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:45:38,535] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 311.64 | bwd_inner_microstep: 311.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:45:39,023] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 310.56 | bwd_inner_microstep: 310.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:45:39,512] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.73 | bwd_microstep: 311.48 | bwd_inner_microstep: 311.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:45:40,000] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 310.94 | bwd_inner_microstep: 310.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 22:45:40,132] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.35 | bwd_microstep: 87.19 | bwd_inner_microstep: 87.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:45:40,619] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 311.70 | bwd_inner_microstep: 311.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:45:41,107] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 311.46 | bwd_inner_microstep: 311.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:45:44,231] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.25 | optimizer_gradients: 17.54 | optimizer_step: 32.02 +[2025-04-26 22:45:44,232] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.48 | bwd_microstep: 1791.95 | bwd_inner_microstep: 116.77 | bwd_allreduce_microstep: 1675.13 | step_microstep: 1289.17 +[2025-04-26 22:45:44,234] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5122.68 | bwd: 10985.44 | bwd_inner: 9309.82 | bwd_allreduce: 1675.26 | step: 1290.23 + 71%|███████ | 218/309 [1:04:16<26:35, 17.53s/it] {'loss': 0.2411, 'learning_rate': 8.466393557702659e-06, 'epoch': 2.1} + 71%|███████ | 218/309 [1:04:16<26:35, 17.53s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:45:44,704] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 293.47 | bwd_inner_microstep: 293.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:45:45,192] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 312.32 | bwd_inner_microstep: 312.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:45:45,679] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.06 | bwd_microstep: 309.94 | bwd_inner_microstep: 309.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:45:46,167] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 310.45 | bwd_inner_microstep: 310.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:45:46,654] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.11 | bwd_microstep: 309.52 | bwd_inner_microstep: 309.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:45:47,141] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:45:47,629] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 309.97 | bwd_inner_microstep: 309.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:45:48,115] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.63 | bwd_microstep: 309.65 | bwd_inner_microstep: 309.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:48,602] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:49,090] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.88 | bwd_microstep: 309.95 | bwd_inner_microstep: 309.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:49,578] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.72 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:50,067] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.80 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:45:50,555] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:45:51,042] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:45:51,530] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 311.17 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:45:52,019] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 310.58 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:52,506] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 310.24 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 22:45:52,880] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.66 | bwd_microstep: 239.71 | bwd_inner_microstep: 239.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:45:53,366] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:45:53,496] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.02 | bwd_microstep: 85.78 | bwd_inner_microstep: 85.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:45:53,982] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 309.45 | bwd_inner_microstep: 309.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:45:54,467] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.54 | bwd_microstep: 309.28 | bwd_inner_microstep: 309.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1927 +[2025-04-26 22:45:54,973] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.88 | bwd_microstep: 322.95 | bwd_inner_microstep: 322.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1923 +[2025-04-26 22:45:55,478] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.29 | bwd_microstep: 322.78 | bwd_inner_microstep: 322.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:45:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 311.19 | bwd_inner_microstep: 311.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:45:56,456] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.14 | bwd_microstep: 310.86 | bwd_inner_microstep: 310.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:45:56,945] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.31 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:45:57,435] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.78 | bwd_microstep: 311.42 | bwd_inner_microstep: 311.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:45:57,923] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.76 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:45:58,413] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 311.80 | bwd_inner_microstep: 311.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:45:58,903] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.76 | bwd_microstep: 312.38 | bwd_inner_microstep: 312.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:01,918] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.51 | optimizer_gradients: 17.56 | optimizer_step: 32.03 +[2025-04-26 22:46:01,919] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 1566.92 | bwd_inner_microstep: 339.97 | bwd_allreduce_microstep: 1226.91 | step_microstep: 1271.49 +[2025-04-26 22:46:01,921] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5356.25 | bwd: 10904.27 | bwd_inner: 9676.90 | bwd_allreduce: 1227.03 | step: 1272.54 + 71%|███████ | 219/309 [1:04:34<26:21, 17.58s/it] {'loss': 0.2139, 'learning_rate': 8.295355314149413e-06, 'epoch': 2.11} + 71%|███████ | 219/309 [1:04:34<26:21, 17.58s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:46:02,390] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 293.58 | bwd_inner_microstep: 293.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:46:02,878] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.26 | bwd_microstep: 312.16 | bwd_inner_microstep: 312.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:46:03,366] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.15 | bwd_microstep: 309.70 | bwd_inner_microstep: 309.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:03,854] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:46:04,342] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.28 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:04,830] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:46:05,318] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.12 | bwd_microstep: 310.24 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:46:05,805] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 311.03 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:46:06,295] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.23 | bwd_microstep: 311.39 | bwd_inner_microstep: 311.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:06,783] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 310.78 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:46:07,271] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 311.00 | bwd_inner_microstep: 310.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:46:07,758] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 309.82 | bwd_inner_microstep: 309.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:08,246] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:08,734] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.25 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:46:09,222] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.99 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:46:09,710] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:10,198] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.81 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:46:10,686] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 310.31 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:46:11,172] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 309.39 | bwd_inner_microstep: 309.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:46:11,659] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 309.86 | bwd_inner_microstep: 309.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:46:12,145] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 309.75 | bwd_inner_microstep: 309.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:46:12,632] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 309.32 | bwd_inner_microstep: 309.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1952 +[2025-04-26 22:46:13,139] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.59 | bwd_microstep: 325.02 | bwd_inner_microstep: 325.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:46:13,629] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.59 | bwd_microstep: 311.78 | bwd_inner_microstep: 311.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:46:14,118] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 311.43 | bwd_inner_microstep: 311.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:46:14,607] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 311.59 | bwd_inner_microstep: 311.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:46:15,113] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.95 | bwd_microstep: 323.05 | bwd_inner_microstep: 323.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 22:46:15,243] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.05 | bwd_microstep: 87.16 | bwd_inner_microstep: 87.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 22:46:15,373] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.58 | bwd_microstep: 86.87 | bwd_inner_microstep: 86.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:15,860] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 311.85 | bwd_inner_microstep: 311.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:16,349] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 312.33 | bwd_inner_microstep: 312.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1403 +[2025-04-26 22:46:18,720] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.80 | optimizer_gradients: 17.51 | optimizer_step: 32.03 +[2025-04-26 22:46:18,720] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.18 | bwd_microstep: 965.75 | bwd_inner_microstep: 567.31 | bwd_allreduce_microstep: 398.36 | step_microstep: 1272.35 +[2025-04-26 22:46:18,722] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5218.22 | bwd: 10160.49 | bwd_inner: 9761.64 | bwd_allreduce: 398.46 | step: 1273.39 + 71%|███████ | 220/309 [1:04:51<25:43, 17.34s/it] {'loss': 0.2237, 'learning_rate': 8.125609216656627e-06, 'epoch': 2.12} + 71%|███████ | 220/309 [1:04:51<25:43, 17.34s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:46:19,193] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 294.65 | bwd_inner_microstep: 294.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:19,682] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 312.85 | bwd_inner_microstep: 312.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:46:20,170] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.26 | bwd_microstep: 310.16 | bwd_inner_microstep: 310.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:20,658] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:46:21,147] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.91 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:46:21,634] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:46:21,765] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.05 | bwd_microstep: 86.84 | bwd_inner_microstep: 86.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:46:22,252] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 311.88 | bwd_inner_microstep: 311.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:46:22,740] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:23,227] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 310.78 | bwd_inner_microstep: 310.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:23,716] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.48 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:46:24,204] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:46:24,693] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.72 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:25,180] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:46:25,312] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.42 | bwd_microstep: 86.56 | bwd_inner_microstep: 86.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:25,799] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 311.69 | bwd_inner_microstep: 311.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:46:26,287] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 310.96 | bwd_inner_microstep: 310.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:46:26,775] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 310.19 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:46:27,262] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:46:27,750] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 309.52 | bwd_inner_microstep: 309.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 357 +[2025-04-26 22:46:27,879] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.43 | bwd_microstep: 85.23 | bwd_inner_microstep: 85.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:46:28,366] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 311.22 | bwd_inner_microstep: 311.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1963 +[2025-04-26 22:46:28,875] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.06 | bwd_microstep: 326.92 | bwd_inner_microstep: 326.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:29,363] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 311.37 | bwd_inner_microstep: 311.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:29,853] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.34 | bwd_microstep: 311.51 | bwd_inner_microstep: 311.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:30,342] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 311.81 | bwd_inner_microstep: 311.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:46:30,831] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.81 | bwd_microstep: 311.23 | bwd_inner_microstep: 311.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:46:31,320] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 311.50 | bwd_inner_microstep: 311.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:31,811] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.23 | bwd_microstep: 311.89 | bwd_inner_microstep: 311.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:46:32,299] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 311.08 | bwd_inner_microstep: 311.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:46:32,789] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.28 | bwd_microstep: 311.55 | bwd_inner_microstep: 311.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:46:35,907] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.12 | optimizer_gradients: 17.52 | optimizer_step: 32.02 +[2025-04-26 22:46:35,908] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.95 | bwd_microstep: 1673.18 | bwd_inner_microstep: 339.86 | bwd_allreduce_microstep: 1333.28 | step_microstep: 1268.77 +[2025-04-26 22:46:35,909] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5128.92 | bwd: 10639.77 | bwd_inner: 9306.02 | bwd_allreduce: 1333.40 | step: 1269.81 + 72%|███████▏ | 221/309 [1:05:08<25:22, 17.30s/it] {'loss': 0.1817, 'learning_rate': 7.957174004516015e-06, 'epoch': 2.13} + 72%|███████▏ | 221/309 [1:05:08<25:22, 17.30s/it]dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:46:36,023] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.97 | bwd_microstep: 69.93 | bwd_inner_microstep: 69.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:46:36,509] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:46:36,998] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 312.44 | bwd_inner_microstep: 312.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 22:46:37,129] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.32 | bwd_microstep: 86.92 | bwd_inner_microstep: 86.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:46:37,615] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 310.96 | bwd_inner_microstep: 310.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:38,103] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.80 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:38,590] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:46:39,077] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 310.45 | bwd_inner_microstep: 310.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:46:39,565] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:46:40,054] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 311.11 | bwd_inner_microstep: 311.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:40,542] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 367 +[2025-04-26 22:46:40,672] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.21 | bwd_microstep: 86.08 | bwd_inner_microstep: 86.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 880 +[2025-04-26 22:46:40,922] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 83.94 | bwd_microstep: 162.10 | bwd_inner_microstep: 162.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:41,409] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.54 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 887 +[2025-04-26 22:46:41,662] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.14 | bwd_microstep: 163.13 | bwd_inner_microstep: 163.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:46:42,149] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.53 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 22:46:42,279] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.99 | bwd_microstep: 86.58 | bwd_inner_microstep: 86.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:42,766] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 311.68 | bwd_inner_microstep: 311.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:46:43,253] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 310.95 | bwd_inner_microstep: 310.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:46:43,740] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:46:44,228] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.99 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:46:44,715] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 309.18 | bwd_inner_microstep: 309.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1962 +[2025-04-26 22:46:45,225] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.35 | bwd_microstep: 325.79 | bwd_inner_microstep: 325.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:46:45,731] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.77 | bwd_microstep: 323.67 | bwd_inner_microstep: 323.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:46:46,220] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.04 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:46,710] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 311.67 | bwd_inner_microstep: 311.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 383 +[2025-04-26 22:46:46,842] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.53 | bwd_microstep: 87.32 | bwd_inner_microstep: 87.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:46:47,330] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 312.34 | bwd_inner_microstep: 312.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:46:47,819] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 312.86 | bwd_inner_microstep: 312.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:48,308] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.80 | bwd_microstep: 311.17 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:46:48,796] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 311.78 | bwd_inner_microstep: 311.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:46:53,719] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.02 | optimizer_gradients: 17.52 | optimizer_step: 32.02 +[2025-04-26 22:46:53,719] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.20 | bwd_microstep: 3474.02 | bwd_inner_microstep: 339.09 | bwd_allreduce_microstep: 3134.89 | step_microstep: 1271.67 +[2025-04-26 22:46:53,721] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4685.52 | bwd: 11708.02 | bwd_inner: 8572.66 | bwd_allreduce: 3135.01 | step: 1272.70 + 72%|███████▏ | 222/309 [1:05:26<25:18, 17.45s/it] {'loss': 0.1816, 'learning_rate': 7.790068272302776e-06, 'epoch': 2.14} + 72%|███████▏ | 222/309 [1:05:26<25:18, 17.45s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:46:54,191] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.68 | bwd_microstep: 294.07 | bwd_inner_microstep: 294.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 22:46:54,323] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 41.23 | bwd_microstep: 86.92 | bwd_inner_microstep: 86.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:54,809] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 311.61 | bwd_inner_microstep: 311.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:46:55,296] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 311.21 | bwd_inner_microstep: 311.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:46:55,784] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:56,271] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:46:56,759] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 310.94 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:46:57,246] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 310.19 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:46:57,377] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.98 | bwd_microstep: 86.44 | bwd_inner_microstep: 86.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:57,863] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 310.99 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:46:58,350] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:58,838] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 311.36 | bwd_inner_microstep: 311.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:59,326] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:46:59,813] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:46:59,944] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.08 | bwd_microstep: 86.59 | bwd_inner_microstep: 86.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:00,429] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 310.96 | bwd_inner_microstep: 310.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:47:00,917] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 311.48 | bwd_inner_microstep: 311.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:47:01,404] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:47:01,892] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 310.05 | bwd_inner_microstep: 310.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:47:02,379] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.93 | bwd_microstep: 310.13 | bwd_inner_microstep: 310.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:47:02,866] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 309.86 | bwd_inner_microstep: 309.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2023 +[2025-04-26 22:47:03,384] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.78 | bwd_microstep: 332.35 | bwd_inner_microstep: 332.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1943 +[2025-04-26 22:47:03,892] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.10 | bwd_microstep: 324.04 | bwd_inner_microstep: 324.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 22:47:04,398] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.05 | bwd_microstep: 322.38 | bwd_inner_microstep: 322.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:47:04,886] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:47:05,377] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.32 | bwd_microstep: 311.79 | bwd_inner_microstep: 311.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:47:05,865] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 311.43 | bwd_inner_microstep: 311.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:47:05,995] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.32 | bwd_microstep: 86.67 | bwd_inner_microstep: 86.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:47:06,483] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:47:06,971] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:07,460] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:47:11,651] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.73 | optimizer_gradients: 17.53 | optimizer_step: 32.04 +[2025-04-26 22:47:11,651] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 2743.39 | bwd_inner_microstep: 338.35 | bwd_allreduce_microstep: 2405.00 | step_microstep: 1270.61 +[2025-04-26 22:47:11,653] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5000.55 | bwd: 11511.84 | bwd_inner: 9106.38 | bwd_allreduce: 2405.11 | step: 1271.63 + 72%|███████▏ | 223/309 [1:05:44<25:13, 17.60s/it] {'loss': 0.2413, 'learning_rate': 7.624310467822833e-06, 'epoch': 2.15} + 72%|███████▏ | 223/309 [1:05:44<25:13, 17.60s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:47:12,124] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 293.98 | bwd_inner_microstep: 293.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:47:12,611] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:13,098] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 309.92 | bwd_inner_microstep: 309.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:13,585] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 309.83 | bwd_inner_microstep: 309.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:47:14,073] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 310.05 | bwd_inner_microstep: 310.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1399 +[2025-04-26 22:47:14,446] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.44 | bwd_microstep: 239.68 | bwd_inner_microstep: 239.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:47:14,932] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:47:15,420] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 310.73 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:15,907] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 309.79 | bwd_inner_microstep: 309.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:47:16,394] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:47:16,882] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 311.06 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:47:17,369] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 309.62 | bwd_inner_microstep: 309.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:47:17,855] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 309.84 | bwd_inner_microstep: 309.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:47:18,343] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:47:18,830] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 309.87 | bwd_inner_microstep: 309.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:47:19,316] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 309.56 | bwd_inner_microstep: 309.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:47:19,802] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 308.84 | bwd_inner_microstep: 308.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:47:20,288] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 309.94 | bwd_inner_microstep: 309.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:47:20,775] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 308.96 | bwd_inner_microstep: 308.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:47:21,260] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 308.77 | bwd_inner_microstep: 308.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:47:21,745] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 308.86 | bwd_inner_microstep: 308.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:47:22,233] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 311.07 | bwd_inner_microstep: 311.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1943 +[2025-04-26 22:47:22,740] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.00 | bwd_microstep: 324.14 | bwd_inner_microstep: 324.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 22:47:23,246] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.22 | bwd_microstep: 323.13 | bwd_inner_microstep: 323.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:47:23,734] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.83 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:47:24,223] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 311.10 | bwd_inner_microstep: 311.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1401 +[2025-04-26 22:47:24,597] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.96 | bwd_microstep: 239.95 | bwd_inner_microstep: 239.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.06 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:47:25,083] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 311.23 | bwd_inner_microstep: 311.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 372 +[2025-04-26 22:47:25,214] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.31 | bwd_microstep: 86.44 | bwd_inner_microstep: 86.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:47:25,702] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 311.44 | bwd_inner_microstep: 311.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:47:26,190] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:29,790] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.60 | optimizer_gradients: 17.52 | optimizer_step: 32.02 +[2025-04-26 22:47:29,790] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 2150.59 | bwd_inner_microstep: 340.04 | bwd_allreduce_microstep: 1810.51 | step_microstep: 1272.39 +[2025-04-26 22:47:29,792] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5298.32 | bwd: 11411.66 | bwd_inner: 9600.69 | bwd_allreduce: 1810.62 | step: 1273.47 + 72%|███████▏ | 224/309 [1:06:02<25:09, 17.76s/it] {'loss': 0.2405, 'learning_rate': 7.459918890076272e-06, 'epoch': 2.16} + 72%|███████▏ | 224/309 [1:06:02<25:09, 17.76s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1907 +[2025-04-26 22:47:30,261] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 293.29 | bwd_inner_microstep: 293.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:30,748] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:31,234] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 309.33 | bwd_inner_microstep: 309.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:47:31,721] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:32,209] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 310.19 | bwd_inner_microstep: 310.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:47:32,696] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 310.14 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:47:33,184] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 310.99 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1907 +[2025-04-26 22:47:33,671] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 309.89 | bwd_inner_microstep: 309.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:34,157] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 309.92 | bwd_inner_microstep: 309.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:34,646] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.63 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:47:35,132] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 309.48 | bwd_inner_microstep: 309.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:35,621] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:47:36,107] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 309.95 | bwd_inner_microstep: 309.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:47:36,593] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 309.51 | bwd_inner_microstep: 309.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:47:37,080] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:47:37,567] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:47:38,053] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 309.98 | bwd_inner_microstep: 309.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:47:38,541] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 310.43 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:39,029] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:47:39,517] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.50 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:47:40,005] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 311.00 | bwd_inner_microstep: 310.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:47:40,492] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 309.22 | bwd_inner_microstep: 309.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1999 +[2025-04-26 22:47:41,005] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.51 | bwd_microstep: 330.05 | bwd_inner_microstep: 330.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:47:41,511] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.78 | bwd_microstep: 322.49 | bwd_inner_microstep: 322.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 22:47:42,016] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.84 | bwd_microstep: 322.73 | bwd_inner_microstep: 322.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 22:47:42,522] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.43 | bwd_microstep: 322.63 | bwd_inner_microstep: 322.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:47:43,009] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:47:43,499] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.29 | bwd_microstep: 311.05 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:47:43,989] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.99 | bwd_microstep: 311.96 | bwd_inner_microstep: 311.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:47:44,477] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 311.22 | bwd_inner_microstep: 311.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:47:44,965] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:47:47,425] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.54 | optimizer_gradients: 17.51 | optimizer_step: 32.03 +[2025-04-26 22:47:47,425] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.39 | bwd_microstep: 1012.76 | bwd_inner_microstep: 623.41 | bwd_allreduce_microstep: 389.32 | step_microstep: 1270.27 +[2025-04-26 22:47:47,427] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5538.96 | bwd: 10671.93 | bwd_inner: 10282.15 | bwd_allreduce: 389.43 | step: 1271.36 + 73%|███████▎ | 225/309 [1:06:20<24:48, 17.72s/it] {'loss': 0.2459, 'learning_rate': 7.296911687237187e-06, 'epoch': 2.17} + 73%|███████▎ | 225/309 [1:06:20<24:48, 17.72s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:47:47,897] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 293.91 | bwd_inner_microstep: 293.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:47:48,028] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.42 | bwd_microstep: 86.44 | bwd_inner_microstep: 86.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:47:48,515] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 313.14 | bwd_inner_microstep: 313.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:47:49,003] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 311.01 | bwd_inner_microstep: 310.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:47:49,490] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 310.83 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:47:49,979] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.03 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:47:50,467] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 311.26 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:50,954] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:47:51,442] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:51,930] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:52,418] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 310.74 | bwd_inner_microstep: 310.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:47:52,906] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.24 | bwd_microstep: 310.05 | bwd_inner_microstep: 310.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:47:53,394] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:47:53,883] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.68 | bwd_microstep: 311.36 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:47:54,371] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:47:54,858] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:47:55,347] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.27 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:47:55,834] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:47:56,322] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:47:56,808] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 309.37 | bwd_inner_microstep: 309.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:47:57,294] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 309.41 | bwd_inner_microstep: 309.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1897 +[2025-04-26 22:47:57,782] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 310.04 | bwd_inner_microstep: 310.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1987 +[2025-04-26 22:47:58,296] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.31 | bwd_microstep: 329.66 | bwd_inner_microstep: 329.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 385 +[2025-04-26 22:47:58,436] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.75 | bwd_microstep: 93.60 | bwd_inner_microstep: 93.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:47:58,923] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 311.64 | bwd_inner_microstep: 311.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 22:47:59,055] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.10 | bwd_microstep: 87.40 | bwd_inner_microstep: 87.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:47:59,542] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 311.47 | bwd_inner_microstep: 311.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:48:00,031] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 311.81 | bwd_inner_microstep: 311.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:48:00,520] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.90 | bwd_microstep: 312.11 | bwd_inner_microstep: 312.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 384 +[2025-04-26 22:48:00,652] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.57 | bwd_microstep: 87.50 | bwd_inner_microstep: 87.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 378 +[2025-04-26 22:48:00,782] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.51 | bwd_microstep: 87.07 | bwd_inner_microstep: 87.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:48:04,865] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.07 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 22:48:04,865] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 2635.68 | bwd_inner_microstep: 338.10 | bwd_allreduce_microstep: 2297.54 | step_microstep: 1271.77 +[2025-04-26 22:48:04,867] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4858.93 | bwd: 11161.33 | bwd_inner: 8863.32 | bwd_allreduce: 2297.66 | step: 1272.78 + 73%|███████▎ | 226/309 [1:06:37<24:23, 17.64s/it] {'loss': 0.2203, 'learning_rate': 7.1353068546502144e-06, 'epoch': 2.18} + 73%|███████▎ | 226/309 [1:06:37<24:23, 17.64s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:48:05,337] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 294.08 | bwd_inner_microstep: 294.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:48:05,825] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 312.55 | bwd_inner_microstep: 312.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:48:06,314] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 311.08 | bwd_inner_microstep: 311.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1400 +[2025-04-26 22:48:06,688] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.56 | bwd_microstep: 239.48 | bwd_inner_microstep: 239.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:48:07,174] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:48:07,664] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.67 | bwd_microstep: 311.27 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:48:08,151] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:48:08,640] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 310.49 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:48:09,128] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.76 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:48:09,617] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.07 | bwd_microstep: 310.98 | bwd_inner_microstep: 310.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:48:09,748] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.82 | bwd_microstep: 86.65 | bwd_inner_microstep: 86.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:48:10,234] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 311.11 | bwd_inner_microstep: 311.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:48:10,722] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:48:11,210] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:48:11,698] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:48:12,186] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:48:12,675] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 22:48:12,806] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.85 | bwd_microstep: 86.45 | bwd_inner_microstep: 86.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:48:13,292] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 311.16 | bwd_inner_microstep: 311.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:48:13,781] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:48:14,266] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 308.88 | bwd_inner_microstep: 308.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:48:14,754] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 309.81 | bwd_inner_microstep: 309.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:48:15,245] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.24 | bwd_microstep: 312.10 | bwd_inner_microstep: 312.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:48:15,734] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 312.13 | bwd_inner_microstep: 312.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:48:16,223] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:48:16,711] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 311.37 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:48:17,201] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.81 | bwd_microstep: 311.32 | bwd_inner_microstep: 311.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:48:17,690] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 311.57 | bwd_inner_microstep: 311.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:48:18,180] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.39 | bwd_microstep: 311.45 | bwd_inner_microstep: 311.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 892 +[2025-04-26 22:48:18,434] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.49 | bwd_microstep: 163.25 | bwd_inner_microstep: 163.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 22:48:18,564] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.07 | bwd_microstep: 86.91 | bwd_inner_microstep: 86.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:48:23,116] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.69 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 22:48:23,116] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 3105.15 | bwd_inner_microstep: 339.98 | bwd_allreduce_microstep: 2765.13 | step_microstep: 1270.56 +[2025-04-26 22:48:23,118] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4980.77 | bwd: 11835.12 | bwd_inner: 9069.53 | bwd_allreduce: 2765.25 | step: 1271.68 + 73%|███████▎ | 227/309 [1:06:55<24:21, 17.82s/it] {'loss': 0.1892, 'learning_rate': 6.975122232843916e-06, 'epoch': 2.19} + 73%|███████▎ | 227/309 [1:06:55<24:21, 17.82s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:48:23,587] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.22 | bwd_microstep: 293.23 | bwd_inner_microstep: 293.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:48:24,075] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 312.27 | bwd_inner_microstep: 312.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1908 +[2025-04-26 22:48:24,562] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 309.69 | bwd_inner_microstep: 309.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:48:25,048] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 310.01 | bwd_inner_microstep: 310.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:48:25,536] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.88 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:48:26,022] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:48:26,510] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:48:26,640] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.77 | bwd_microstep: 86.65 | bwd_inner_microstep: 86.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:48:27,126] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 311.16 | bwd_inner_microstep: 311.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:48:27,614] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 311.21 | bwd_inner_microstep: 311.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:48:28,101] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 310.05 | bwd_inner_microstep: 310.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:48:28,232] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.04 | bwd_microstep: 86.53 | bwd_inner_microstep: 86.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 22:48:28,603] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 126.91 | bwd_microstep: 239.53 | bwd_inner_microstep: 239.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:48:29,090] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:48:29,577] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:48:30,065] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:48:30,553] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:48:31,039] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 309.63 | bwd_inner_microstep: 309.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:48:31,527] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 309.90 | bwd_inner_microstep: 309.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:48:32,014] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 309.54 | bwd_inner_microstep: 309.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:48:32,503] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 310.98 | bwd_inner_microstep: 310.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:48:32,990] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.85 | bwd_microstep: 308.81 | bwd_inner_microstep: 308.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 22:48:33,496] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.03 | bwd_microstep: 323.23 | bwd_inner_microstep: 323.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1925 +[2025-04-26 22:48:34,004] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.96 | bwd_microstep: 323.10 | bwd_inner_microstep: 323.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:48:34,492] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 311.02 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:48:34,981] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:48:35,469] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:48:35,958] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 311.64 | bwd_inner_microstep: 311.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:48:36,446] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:48:36,934] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 311.18 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:48:37,423] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.83 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:48:40,366] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.93 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 22:48:40,366] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.25 | bwd_microstep: 1493.30 | bwd_inner_microstep: 340.62 | bwd_allreduce_microstep: 1152.64 | step_microstep: 1271.77 +[2025-04-26 22:48:40,368] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5209.06 | bwd: 10609.62 | bwd_inner: 9456.51 | bwd_allreduce: 1152.76 | step: 1272.86 + 74%|███████▍ | 228/309 [1:07:13<23:49, 17.65s/it] {'loss': 0.191, 'learning_rate': 6.816375505561248e-06, 'epoch': 2.2} + 74%|███████▍ | 228/309 [1:07:13<23:49, 17.65s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:48:40,839] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 294.03 | bwd_inner_microstep: 294.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:48:41,326] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 311.16 | bwd_inner_microstep: 311.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:48:41,813] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 309.81 | bwd_inner_microstep: 309.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:48:42,302] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 311.26 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:48:42,789] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.88 | bwd_microstep: 309.61 | bwd_inner_microstep: 309.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:48:43,277] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:48:43,764] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 309.84 | bwd_inner_microstep: 309.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:48:44,252] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 310.45 | bwd_inner_microstep: 310.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:48:44,741] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 310.62 | bwd_inner_microstep: 310.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:48:45,229] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:48:45,716] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 309.99 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:48:46,205] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:48:46,693] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:48:47,182] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 310.86 | bwd_inner_microstep: 310.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:48:47,670] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 310.14 | bwd_inner_microstep: 310.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:48:48,159] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 311.69 | bwd_inner_microstep: 311.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:48:48,291] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 41.51 | bwd_microstep: 85.77 | bwd_inner_microstep: 85.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:48:48,777] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:48:49,264] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 310.97 | bwd_inner_microstep: 310.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:48:49,751] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 309.35 | bwd_inner_microstep: 309.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:48:50,237] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 309.24 | bwd_inner_microstep: 309.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:48:50,724] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 309.43 | bwd_inner_microstep: 309.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2011 +[2025-04-26 22:48:51,240] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.64 | bwd_microstep: 330.98 | bwd_inner_microstep: 330.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:48:51,729] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 311.38 | bwd_inner_microstep: 311.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:48:52,220] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.59 | bwd_microstep: 311.63 | bwd_inner_microstep: 311.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 383 +[2025-04-26 22:48:52,352] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.18 | bwd_microstep: 87.35 | bwd_inner_microstep: 87.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:48:52,840] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 312.01 | bwd_inner_microstep: 312.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:48:53,329] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 311.80 | bwd_inner_microstep: 311.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:48:53,818] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 311.20 | bwd_inner_microstep: 311.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:48:54,308] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.20 | bwd_microstep: 311.36 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:48:54,797] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.85 | bwd_microstep: 311.10 | bwd_inner_microstep: 311.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:48:57,257] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.35 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 22:48:57,258] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.00 | bwd_microstep: 1013.25 | bwd_inner_microstep: 623.68 | bwd_allreduce_microstep: 389.53 | step_microstep: 1270.10 +[2025-04-26 22:48:57,259] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5255.11 | bwd: 10199.77 | bwd_inner: 9809.77 | bwd_allreduce: 389.65 | step: 1271.23 + 74%|███████▍ | 229/309 [1:07:29<23:13, 17.42s/it] {'loss': 0.2545, 'learning_rate': 6.659084197807348e-06, 'epoch': 2.21} + 74%|███████▍ | 229/309 [1:07:29<23:13, 17.42s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:48:57,731] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 294.44 | bwd_inner_microstep: 294.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:48:58,220] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 311.79 | bwd_inner_microstep: 311.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:48:58,707] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.87 | bwd_microstep: 310.04 | bwd_inner_microstep: 310.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:48:59,196] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 311.29 | bwd_inner_microstep: 311.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:48:59,684] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.17 | bwd_microstep: 310.43 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 22:48:59,814] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.88 | bwd_microstep: 86.76 | bwd_inner_microstep: 86.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:49:00,302] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 312.61 | bwd_inner_microstep: 312.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:00,790] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:49:01,278] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 310.73 | bwd_inner_microstep: 310.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:49:01,767] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.37 | bwd_microstep: 311.09 | bwd_inner_microstep: 311.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:02,255] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:02,744] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 311.18 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:49:03,232] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.77 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:03,720] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 310.85 | bwd_inner_microstep: 310.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:49:04,209] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 311.78 | bwd_inner_microstep: 311.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:49:04,697] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.06 | bwd_microstep: 311.01 | bwd_inner_microstep: 310.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:49:05,185] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 310.18 | bwd_inner_microstep: 310.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:49:05,673] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.13 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:49:06,161] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 310.86 | bwd_inner_microstep: 310.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:49:06,291] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.91 | bwd_microstep: 86.04 | bwd_inner_microstep: 86.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:49:06,778] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 311.54 | bwd_inner_microstep: 311.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:49:07,265] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 309.44 | bwd_inner_microstep: 309.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1951 +[2025-04-26 22:49:07,772] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.68 | bwd_microstep: 325.01 | bwd_inner_microstep: 324.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 22:49:08,279] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.97 | bwd_microstep: 322.91 | bwd_inner_microstep: 322.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:49:08,768] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 311.31 | bwd_inner_microstep: 311.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:49:09,258] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.47 | bwd_microstep: 311.51 | bwd_inner_microstep: 311.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:49:09,747] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 311.86 | bwd_inner_microstep: 311.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:49:10,235] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.92 | bwd_microstep: 311.05 | bwd_inner_microstep: 311.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:49:10,726] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.09 | bwd_microstep: 312.01 | bwd_inner_microstep: 311.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:11,217] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.08 | bwd_microstep: 311.36 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:49:11,706] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 311.50 | bwd_inner_microstep: 311.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:49:14,286] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.02 | optimizer_gradients: 17.52 | optimizer_step: 32.05 +[2025-04-26 22:49:14,287] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.49 | bwd_microstep: 1130.90 | bwd_inner_microstep: 375.43 | bwd_allreduce_microstep: 755.36 | step_microstep: 1271.79 +[2025-04-26 22:49:14,288] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5270.37 | bwd: 10333.79 | bwd_inner: 9577.93 | bwd_allreduce: 755.44 | step: 1272.82 + 74%|███████▍ | 230/309 [1:07:46<22:47, 17.30s/it] {'loss': 0.2312, 'learning_rate': 6.503265673914849e-06, 'epoch': 2.21} + 74%|███████▍ | 230/309 [1:07:46<22:47, 17.30s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:14,759] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 294.37 | bwd_inner_microstep: 294.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:49:15,249] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 313.16 | bwd_inner_microstep: 313.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:15,737] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.95 | bwd_microstep: 310.61 | bwd_inner_microstep: 310.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:49:16,224] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:49:16,713] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.66 | bwd_microstep: 310.85 | bwd_inner_microstep: 310.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:17,202] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.13 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:17,690] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.77 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:49:18,179] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.25 | bwd_microstep: 311.16 | bwd_inner_microstep: 311.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:18,667] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:49:19,156] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.39 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:49:19,286] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.85 | bwd_microstep: 86.83 | bwd_inner_microstep: 86.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:49:19,774] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 312.10 | bwd_inner_microstep: 312.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:49:20,263] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 311.32 | bwd_inner_microstep: 311.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:49:20,751] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 310.73 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:49:21,239] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.88 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:49:21,727] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:49:22,214] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 309.77 | bwd_inner_microstep: 309.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:49:22,702] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 310.73 | bwd_inner_microstep: 310.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:49:23,190] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 310.42 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:49:23,676] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 309.54 | bwd_inner_microstep: 309.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:49:24,163] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.03 | bwd_microstep: 309.66 | bwd_inner_microstep: 309.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:49:24,652] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1986 +[2025-04-26 22:49:25,166] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.54 | bwd_microstep: 329.50 | bwd_inner_microstep: 329.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1925 +[2025-04-26 22:49:25,674] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.84 | bwd_microstep: 324.19 | bwd_inner_microstep: 324.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:49:26,164] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.34 | bwd_microstep: 311.97 | bwd_inner_microstep: 311.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:49:26,655] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 312.67 | bwd_inner_microstep: 312.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:49:27,145] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.69 | bwd_microstep: 311.78 | bwd_inner_microstep: 311.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:49:27,634] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 311.60 | bwd_inner_microstep: 311.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:49:28,125] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.28 | bwd_microstep: 312.06 | bwd_inner_microstep: 312.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:49:28,616] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.36 | bwd_microstep: 312.11 | bwd_inner_microstep: 312.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:49:29,105] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 312.21 | bwd_inner_microstep: 312.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:49:31,581] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.37 | optimizer_gradients: 17.51 | optimizer_step: 32.04 +[2025-04-26 22:49:31,582] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.52 | bwd_microstep: 1026.97 | bwd_inner_microstep: 620.59 | bwd_allreduce_microstep: 406.26 | step_microstep: 1271.21 +[2025-04-26 22:49:31,583] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5406.72 | bwd: 10462.36 | bwd_inner: 10055.58 | bwd_allreduce: 406.33 | step: 1272.25 + 75%|███████▍ | 231/309 [1:08:04<22:29, 17.30s/it] {'loss': 0.1733, 'learning_rate': 6.348937135626922e-06, 'epoch': 2.22} + 75%|███████▍ | 231/309 [1:08:04<22:29, 17.30s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:49:32,055] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 295.15 | bwd_inner_microstep: 295.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 888 +[2025-04-26 22:49:32,308] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.57 | bwd_microstep: 162.57 | bwd_inner_microstep: 162.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:49:32,438] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.98 | bwd_microstep: 86.58 | bwd_inner_microstep: 86.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:49:32,925] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:33,414] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 311.43 | bwd_inner_microstep: 311.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:49:33,903] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 311.26 | bwd_inner_microstep: 311.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:49:34,391] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:49:34,881] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.43 | bwd_microstep: 311.89 | bwd_inner_microstep: 311.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:35,370] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.97 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:49:35,858] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:49:36,347] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.57 | bwd_microstep: 310.42 | bwd_inner_microstep: 310.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:49:36,477] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.94 | bwd_microstep: 86.67 | bwd_inner_microstep: 86.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:36,964] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 312.12 | bwd_inner_microstep: 312.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:37,453] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 311.17 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:49:37,941] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:38,430] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.33 | bwd_microstep: 310.56 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:38,918] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 311.35 | bwd_inner_microstep: 311.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:49:39,406] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 311.17 | bwd_inner_microstep: 311.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 368 +[2025-04-26 22:49:39,537] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.79 | bwd_microstep: 86.32 | bwd_inner_microstep: 86.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:49:40,022] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:49:40,509] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.83 | bwd_microstep: 310.24 | bwd_inner_microstep: 310.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2037 +[2025-04-26 22:49:41,029] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.33 | bwd_microstep: 334.66 | bwd_inner_microstep: 334.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:49:41,518] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 311.49 | bwd_inner_microstep: 311.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:49:42,009] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.44 | bwd_microstep: 312.14 | bwd_inner_microstep: 312.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:49:42,498] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.04 | bwd_microstep: 311.53 | bwd_inner_microstep: 311.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:49:42,988] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.85 | bwd_microstep: 311.37 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:49:43,478] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.28 | bwd_microstep: 312.26 | bwd_inner_microstep: 312.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:49:43,967] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 311.66 | bwd_inner_microstep: 311.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:49:44,456] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 311.32 | bwd_inner_microstep: 311.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:49:44,945] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.50 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:49:45,435] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.57 | bwd_microstep: 311.61 | bwd_inner_microstep: 311.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 893 +[2025-04-26 22:49:48,855] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.52 | optimizer_gradients: 17.53 | optimizer_step: 32.05 +[2025-04-26 22:49:48,856] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.91 | bwd_microstep: 2058.51 | bwd_inner_microstep: 193.24 | bwd_allreduce_microstep: 1865.23 | step_microstep: 1271.27 +[2025-04-26 22:49:48,857] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4957.54 | bwd: 10892.45 | bwd_inner: 9026.76 | bwd_allreduce: 1865.35 | step: 1272.30 + 75%|███████▌ | 232/309 [1:08:21<22:11, 17.29s/it] {'loss': 0.176, 'learning_rate': 6.196115620198271e-06, 'epoch': 2.23} + 75%|███████▌ | 232/309 [1:08:21<22:11, 17.29s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:49:49,328] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 293.66 | bwd_inner_microstep: 293.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:49:49,816] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 311.99 | bwd_inner_microstep: 311.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:50,304] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:49:50,792] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 310.97 | bwd_inner_microstep: 310.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:51,280] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.80 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:51,768] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 310.43 | bwd_inner_microstep: 310.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:49:52,256] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.90 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:49:52,745] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.96 | bwd_inner_microstep: 310.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:53,233] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:53,720] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:54,209] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 310.61 | bwd_inner_microstep: 310.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:49:54,697] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.43 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:49:55,186] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 310.58 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:49:55,675] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 310.91 | bwd_inner_microstep: 310.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:56,163] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.38 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:49:56,652] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.39 | bwd_microstep: 311.15 | bwd_inner_microstep: 311.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 880 +[2025-04-26 22:49:56,905] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.78 | bwd_microstep: 162.40 | bwd_inner_microstep: 162.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:49:57,392] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:49:57,880] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.51 | bwd_microstep: 310.94 | bwd_inner_microstep: 310.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:49:58,367] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 309.37 | bwd_inner_microstep: 309.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:49:58,854] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 309.90 | bwd_inner_microstep: 309.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2010 +[2025-04-26 22:49:59,371] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.12 | bwd_microstep: 331.01 | bwd_inner_microstep: 331.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 22:49:59,878] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.23 | bwd_microstep: 323.42 | bwd_inner_microstep: 323.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:50:00,386] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.21 | bwd_microstep: 323.38 | bwd_inner_microstep: 323.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 378 +[2025-04-26 22:50:00,517] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.12 | bwd_microstep: 87.05 | bwd_inner_microstep: 87.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:50:01,008] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:50:01,498] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 312.68 | bwd_inner_microstep: 312.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:50:01,987] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.62 | bwd_microstep: 311.06 | bwd_inner_microstep: 311.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:50:02,476] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 22:50:02,608] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.29 | bwd_microstep: 87.14 | bwd_inner_microstep: 87.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 22:50:02,738] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.63 | bwd_microstep: 87.26 | bwd_inner_microstep: 87.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:50:04,925] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.94 | optimizer_gradients: 17.53 | optimizer_step: 32.04 +[2025-04-26 22:50:04,925] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.58 | bwd_microstep: 875.00 | bwd_inner_microstep: 358.74 | bwd_allreduce_microstep: 516.13 | step_microstep: 1269.65 +[2025-04-26 22:50:04,927] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4915.22 | bwd: 9718.00 | bwd_inner: 9201.37 | bwd_allreduce: 516.21 | step: 1270.81 + 75%|███████▌ | 233/309 [1:08:37<21:26, 16.93s/it] {'loss': 0.2399, 'learning_rate': 6.044817998514296e-06, 'epoch': 2.24} + 75%|███████▌ | 233/309 [1:08:37<21:26, 16.93s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:50:05,398] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 295.45 | bwd_inner_microstep: 295.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:50:05,887] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:50:06,373] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:06,862] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.19 | bwd_microstep: 310.62 | bwd_inner_microstep: 310.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:50:07,351] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.60 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:07,839] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 310.93 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:50:08,328] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 311.29 | bwd_inner_microstep: 311.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:50:08,817] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 311.09 | bwd_inner_microstep: 311.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:09,305] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.62 | bwd_inner_microstep: 310.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:09,793] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 311.03 | bwd_inner_microstep: 311.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.19 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:10,282] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.93 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 368 +[2025-04-26 22:50:10,412] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.77 | bwd_microstep: 86.04 | bwd_inner_microstep: 86.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:50:10,900] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 311.71 | bwd_inner_microstep: 311.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:50:11,388] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:50:11,878] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.87 | bwd_microstep: 311.35 | bwd_inner_microstep: 311.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:50:12,365] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:50:12,853] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:50:13,343] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.48 | bwd_microstep: 311.16 | bwd_inner_microstep: 311.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:50:13,831] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1895 +[2025-04-26 22:50:14,318] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 309.64 | bwd_inner_microstep: 309.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:50:14,806] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 309.42 | bwd_inner_microstep: 309.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 406 +[2025-04-26 22:50:14,948] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 43.33 | bwd_microstep: 95.17 | bwd_inner_microstep: 95.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1975 +[2025-04-26 22:50:15,460] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.62 | bwd_microstep: 328.53 | bwd_inner_microstep: 328.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:50:15,948] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 311.67 | bwd_inner_microstep: 311.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:50:16,455] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.63 | bwd_microstep: 323.56 | bwd_inner_microstep: 323.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:50:16,945] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.06 | bwd_microstep: 311.55 | bwd_inner_microstep: 311.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:50:17,435] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.03 | bwd_microstep: 312.25 | bwd_inner_microstep: 312.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:50:17,942] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.28 | bwd_microstep: 323.51 | bwd_inner_microstep: 323.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:50:18,432] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.30 | bwd_microstep: 311.51 | bwd_inner_microstep: 311.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:50:18,922] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.21 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:50:19,411] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.59 | bwd_microstep: 311.29 | bwd_inner_microstep: 311.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:50:22,013] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.43 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 22:50:22,014] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.49 | bwd_microstep: 1285.80 | bwd_inner_microstep: 115.83 | bwd_allreduce_microstep: 1169.93 | step_microstep: 1272.27 +[2025-04-26 22:50:22,015] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5143.05 | bwd: 10510.96 | bwd_inner: 9340.56 | bwd_allreduce: 1170.05 | step: 1273.54 + 76%|███████▌ | 234/309 [1:08:54<21:13, 16.97s/it] {'loss': 0.2473, 'learning_rate': 5.895060973228606e-06, 'epoch': 2.25} + 76%|███████▌ | 234/309 [1:08:54<21:13, 16.97s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1907 +[2025-04-26 22:50:22,488] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 295.14 | bwd_inner_microstep: 295.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:50:22,977] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:23,463] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 309.99 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 22:50:23,594] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.03 | bwd_microstep: 86.41 | bwd_inner_microstep: 86.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 22:50:23,723] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.98 | bwd_microstep: 86.40 | bwd_inner_microstep: 86.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:50:24,210] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 310.97 | bwd_inner_microstep: 310.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:50:24,698] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 311.31 | bwd_inner_microstep: 311.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:25,185] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 310.69 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:50:25,673] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 310.06 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:50:26,161] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.77 | bwd_microstep: 310.85 | bwd_inner_microstep: 310.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:50:26,650] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:50:27,137] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 310.18 | bwd_inner_microstep: 310.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:27,627] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:50:28,115] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 311.44 | bwd_inner_microstep: 311.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:50:28,603] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:50:29,092] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:50:29,581] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:50:30,069] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:50:30,558] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.55 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:50:31,045] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:50:31,535] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.14 | bwd_microstep: 311.29 | bwd_inner_microstep: 311.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:50:32,021] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 309.77 | bwd_inner_microstep: 309.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1950 +[2025-04-26 22:50:32,529] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.82 | bwd_microstep: 325.07 | bwd_inner_microstep: 325.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:50:33,020] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.77 | bwd_microstep: 311.79 | bwd_inner_microstep: 311.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:50:33,510] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 311.61 | bwd_inner_microstep: 311.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:50:33,999] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 311.63 | bwd_inner_microstep: 311.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:50:34,488] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 311.69 | bwd_inner_microstep: 311.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:50:34,978] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.21 | bwd_microstep: 311.49 | bwd_inner_microstep: 311.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:50:35,467] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.73 | bwd_microstep: 311.46 | bwd_inner_microstep: 311.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:50:35,598] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.46 | bwd_microstep: 86.68 | bwd_inner_microstep: 86.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:50:36,086] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 311.97 | bwd_inner_microstep: 311.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 22:50:39,825] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.69 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 22:50:39,825] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.86 | bwd_microstep: 2424.47 | bwd_inner_microstep: 115.37 | bwd_allreduce_microstep: 2309.06 | step_microstep: 1270.39 +[2025-04-26 22:50:39,827] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4989.18 | bwd: 11388.66 | bwd_inner: 9079.13 | bwd_allreduce: 2309.18 | step: 1271.50 + 76%|███████▌ | 235/309 [1:09:12<21:14, 17.23s/it] {'loss': 0.248, 'learning_rate': 5.7468610769191195e-06, 'epoch': 2.26} + 76%|███████▌ | 235/309 [1:09:12<21:14, 17.23s/it]dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:50:39,939] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 36.55 | bwd_microstep: 70.08 | bwd_inner_microstep: 70.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:50:40,424] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 309.71 | bwd_inner_microstep: 309.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:50:40,555] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.93 | bwd_microstep: 86.76 | bwd_inner_microstep: 86.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:50:41,044] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 312.25 | bwd_inner_microstep: 312.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:50:41,531] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 310.16 | bwd_inner_microstep: 310.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:50:42,018] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:42,505] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 309.80 | bwd_inner_microstep: 309.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:42,993] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:43,480] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.01 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:43,969] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 311.15 | bwd_inner_microstep: 311.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:50:44,456] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 309.32 | bwd_inner_microstep: 309.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:50:44,943] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:50:45,431] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.16 | bwd_microstep: 309.77 | bwd_inner_microstep: 309.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:50:45,918] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 309.85 | bwd_inner_microstep: 309.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:50:46,405] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 309.75 | bwd_inner_microstep: 309.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:50:46,892] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 309.80 | bwd_inner_microstep: 309.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:47,382] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:47,871] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:48,359] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 310.98 | bwd_inner_microstep: 310.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:50:48,847] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 309.94 | bwd_inner_microstep: 309.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:50:49,334] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 356 +[2025-04-26 22:50:49,464] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.80 | bwd_microstep: 84.89 | bwd_inner_microstep: 84.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1941 +[2025-04-26 22:50:49,971] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.86 | bwd_microstep: 324.02 | bwd_inner_microstep: 324.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 22:50:50,476] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.90 | bwd_microstep: 323.79 | bwd_inner_microstep: 323.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 384 +[2025-04-26 22:50:50,608] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.38 | bwd_microstep: 87.12 | bwd_inner_microstep: 87.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:50:51,096] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 311.25 | bwd_inner_microstep: 311.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:50:51,584] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 311.52 | bwd_inner_microstep: 311.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:50:52,074] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 311.95 | bwd_inner_microstep: 311.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:50:52,563] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 311.49 | bwd_inner_microstep: 311.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:50:53,052] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:50:53,540] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:50:57,344] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.51 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 22:50:57,345] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 2354.23 | bwd_inner_microstep: 337.83 | bwd_allreduce_microstep: 2016.36 | step_microstep: 1272.33 +[2025-04-26 22:50:57,346] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4987.15 | bwd: 11093.22 | bwd_inner: 9076.38 | bwd_allreduce: 2016.48 | step: 1273.46 + 76%|███████▋ | 236/309 [1:09:30<21:03, 17.31s/it] {'loss': 0.2179, 'learning_rate': 5.600234670262925e-06, 'epoch': 2.27} + 76%|███████▋ | 236/309 [1:09:30<21:03, 17.31s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:50:57,816] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 294.03 | bwd_inner_microstep: 294.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:50:58,304] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 311.27 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:50:58,791] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 310.27 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:50:58,922] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.98 | bwd_microstep: 86.14 | bwd_inner_microstep: 86.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:50:59,408] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:50:59,895] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 311.05 | bwd_inner_microstep: 311.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:00,382] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 310.18 | bwd_inner_microstep: 310.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:00,869] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:01,357] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:51:01,845] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.07 | bwd_microstep: 310.60 | bwd_inner_microstep: 310.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:02,334] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.99 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1907 +[2025-04-26 22:51:02,821] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:03,310] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 310.69 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:51:03,797] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 310.33 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:51:04,284] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.06 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:51:04,771] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 310.16 | bwd_inner_microstep: 310.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:51:05,258] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 309.94 | bwd_inner_microstep: 309.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:51:05,745] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 309.55 | bwd_inner_microstep: 309.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:51:06,233] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.56 | bwd_inner_microstep: 310.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:51:06,719] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 309.30 | bwd_inner_microstep: 309.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:51:07,206] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 309.20 | bwd_inner_microstep: 309.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 512 +[2025-04-26 22:51:07,358] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 44.31 | bwd_microstep: 104.22 | bwd_inner_microstep: 104.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1964 +[2025-04-26 22:51:07,868] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.52 | bwd_microstep: 326.84 | bwd_inner_microstep: 326.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1924 +[2025-04-26 22:51:08,373] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.94 | bwd_microstep: 322.93 | bwd_inner_microstep: 322.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 22:51:08,879] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.42 | bwd_microstep: 323.19 | bwd_inner_microstep: 323.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 372 +[2025-04-26 22:51:09,009] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.68 | bwd_microstep: 86.40 | bwd_inner_microstep: 86.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:51:09,513] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.80 | bwd_microstep: 323.69 | bwd_inner_microstep: 323.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:51:10,001] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 311.36 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:51:10,490] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 311.62 | bwd_inner_microstep: 311.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:51:10,978] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 311.53 | bwd_inner_microstep: 311.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:51:11,467] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 311.26 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:51:14,887] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.58 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 22:51:14,888] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 1957.24 | bwd_inner_microstep: 339.55 | bwd_allreduce_microstep: 1617.64 | step_microstep: 1285.25 +[2025-04-26 22:51:14,889] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5132.73 | bwd: 10965.48 | bwd_inner: 9347.38 | bwd_allreduce: 1617.76 | step: 1286.37 + 77%|███████▋ | 237/309 [1:09:47<20:51, 17.38s/it] {'loss': 0.277, 'learning_rate': 5.455197940230137e-06, 'epoch': 2.28} + 77%|███████▋ | 237/309 [1:09:47<20:51, 17.38s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:51:15,360] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 294.88 | bwd_inner_microstep: 294.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:51:15,849] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 311.02 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:51:16,335] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 309.88 | bwd_inner_microstep: 309.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:51:16,823] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 309.81 | bwd_inner_microstep: 309.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 22:51:16,954] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.01 | bwd_microstep: 86.90 | bwd_inner_microstep: 86.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:51:17,441] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 311.22 | bwd_inner_microstep: 311.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:51:17,928] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.54 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:51:18,416] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 311.10 | bwd_inner_microstep: 311.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:51:18,904] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.40 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:51:19,390] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 309.49 | bwd_inner_microstep: 309.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:19,878] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.97 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:51:20,366] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.06 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 881 +[2025-04-26 22:51:20,619] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.69 | bwd_microstep: 162.69 | bwd_inner_microstep: 162.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:51:21,106] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:21,594] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.77 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:51:22,081] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:51:22,568] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:51:23,056] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:51:23,543] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 309.72 | bwd_inner_microstep: 309.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:51:23,673] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.81 | bwd_microstep: 85.62 | bwd_inner_microstep: 85.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 869 +[2025-04-26 22:51:23,921] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 83.51 | bwd_microstep: 160.92 | bwd_inner_microstep: 160.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 355 +[2025-04-26 22:51:24,059] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 49.80 | bwd_microstep: 81.93 | bwd_inner_microstep: 81.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2034 +[2025-04-26 22:51:24,577] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.27 | bwd_microstep: 333.77 | bwd_inner_microstep: 333.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1924 +[2025-04-26 22:51:25,082] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.76 | bwd_microstep: 323.20 | bwd_inner_microstep: 323.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1927 +[2025-04-26 22:51:25,589] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.39 | bwd_microstep: 323.02 | bwd_inner_microstep: 323.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:51:26,077] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:51:26,567] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.40 | bwd_microstep: 311.37 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 22:51:26,699] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.27 | bwd_microstep: 87.11 | bwd_inner_microstep: 87.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:51:27,185] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 311.56 | bwd_inner_microstep: 311.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:51:27,674] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 312.23 | bwd_inner_microstep: 312.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:51:28,161] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1400 +[2025-04-26 22:51:32,556] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.17 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 22:51:32,556] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.59 | bwd_microstep: 2989.38 | bwd_inner_microstep: 267.29 | bwd_allreduce_microstep: 2722.05 | step_microstep: 1272.04 +[2025-04-26 22:51:32,558] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4782.38 | bwd: 11452.07 | bwd_inner: 8729.55 | bwd_allreduce: 2722.17 | step: 1273.19 + 77%|███████▋ | 238/309 [1:10:05<20:40, 17.47s/it] {'loss': 0.221, 'learning_rate': 5.311766898296915e-06, 'epoch': 2.29} + 77%|███████▋ | 238/309 [1:10:05<20:40, 17.47s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1908 +[2025-04-26 22:51:33,026] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.57 | bwd_microstep: 293.18 | bwd_inner_microstep: 293.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:33,515] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 311.07 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:51:34,003] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 309.88 | bwd_inner_microstep: 309.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:51:34,490] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:51:34,976] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.54 | bwd_microstep: 309.29 | bwd_inner_microstep: 309.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:51:35,464] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:51:35,951] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 309.97 | bwd_inner_microstep: 309.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:51:36,437] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 309.61 | bwd_inner_microstep: 309.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:36,924] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 309.78 | bwd_inner_microstep: 309.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:51:37,411] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 310.04 | bwd_inner_microstep: 310.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 882 +[2025-04-26 22:51:37,663] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.33 | bwd_microstep: 162.21 | bwd_inner_microstep: 162.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:38,149] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 311.00 | bwd_inner_microstep: 310.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:51:38,635] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:39,122] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 310.27 | bwd_inner_microstep: 310.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:51:39,252] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.22 | bwd_microstep: 86.31 | bwd_inner_microstep: 86.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:39,737] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 310.24 | bwd_inner_microstep: 310.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:40,225] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 311.33 | bwd_inner_microstep: 311.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:51:40,712] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 309.59 | bwd_inner_microstep: 309.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 878 +[2025-04-26 22:51:40,964] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.32 | bwd_microstep: 162.36 | bwd_inner_microstep: 162.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:51:41,451] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:51:41,936] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 309.14 | bwd_inner_microstep: 309.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:51:42,422] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 309.01 | bwd_inner_microstep: 308.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 22:51:42,928] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.71 | bwd_microstep: 323.21 | bwd_inner_microstep: 323.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:51:43,415] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.45 | bwd_inner_microstep: 310.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:51:43,905] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 311.37 | bwd_inner_microstep: 311.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:51:44,393] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 311.31 | bwd_inner_microstep: 311.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:51:44,882] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 310.96 | bwd_inner_microstep: 310.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:51:45,388] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.36 | bwd_microstep: 322.61 | bwd_inner_microstep: 322.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:45,876] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.19 | bwd_microstep: 309.88 | bwd_inner_microstep: 309.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:51:46,365] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:51:46,855] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.54 | bwd_microstep: 311.32 | bwd_inner_microstep: 311.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:51:49,608] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.27 | optimizer_gradients: 17.51 | optimizer_step: 32.05 +[2025-04-26 22:51:49,608] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 1305.59 | bwd_inner_microstep: 340.40 | bwd_allreduce_microstep: 965.15 | step_microstep: 1269.97 +[2025-04-26 22:51:49,610] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5201.65 | bwd: 10413.14 | bwd_inner: 9447.52 | bwd_allreduce: 965.27 | step: 1271.10 + 77%|███████▋ | 239/309 [1:10:22<20:14, 17.34s/it] {'loss': 0.2477, 'learning_rate': 5.169957378677859e-06, 'epoch': 2.3} + 77%|███████▋ | 239/309 [1:10:22<20:14, 17.34s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:50,078] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.54 | bwd_microstep: 292.91 | bwd_inner_microstep: 292.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:51:50,568] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 312.30 | bwd_inner_microstep: 312.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:51:50,698] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.76 | bwd_microstep: 86.56 | bwd_inner_microstep: 86.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:51:51,185] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 311.31 | bwd_inner_microstep: 311.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:51:51,673] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.95 | bwd_microstep: 311.08 | bwd_inner_microstep: 311.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:51:52,161] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:51:52,293] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.74 | bwd_microstep: 86.37 | bwd_inner_microstep: 86.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:51:52,779] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:51:53,267] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 311.19 | bwd_inner_microstep: 311.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.06 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:51:53,399] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.75 | bwd_microstep: 86.51 | bwd_inner_microstep: 86.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:51:53,885] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:54,372] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 310.53 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:51:54,858] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 309.27 | bwd_inner_microstep: 309.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:51:55,346] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 310.13 | bwd_inner_microstep: 310.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:51:55,835] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 310.31 | bwd_inner_microstep: 310.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:51:56,322] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:51:56,809] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 309.95 | bwd_inner_microstep: 309.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:51:57,297] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:51:57,784] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:51:58,272] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 309.57 | bwd_inner_microstep: 309.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:51:58,758] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 308.81 | bwd_inner_microstep: 308.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2058 +[2025-04-26 22:51:59,285] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.41 | bwd_microstep: 338.83 | bwd_inner_microstep: 338.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1944 +[2025-04-26 22:51:59,792] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.42 | bwd_microstep: 324.04 | bwd_inner_microstep: 324.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1924 +[2025-04-26 22:52:00,299] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.16 | bwd_microstep: 323.07 | bwd_inner_microstep: 323.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:52:00,790] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 311.13 | bwd_inner_microstep: 311.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:52:01,279] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:52:01,768] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 311.29 | bwd_inner_microstep: 311.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:52:02,274] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.14 | bwd_microstep: 322.71 | bwd_inner_microstep: 322.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:52:02,762] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.35 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:52:03,250] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:52:03,738] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:52:06,197] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.54 | optimizer_gradients: 17.51 | optimizer_step: 32.04 +[2025-04-26 22:52:06,198] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.47 | bwd_microstep: 1010.66 | bwd_inner_microstep: 622.56 | bwd_allreduce_microstep: 388.07 | step_microstep: 1271.32 +[2025-04-26 22:52:06,199] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5138.19 | bwd: 10014.50 | bwd_inner: 9625.97 | bwd_allreduce: 388.19 | step: 1272.54 + 78%|███████▊ | 240/309 [1:10:38<19:41, 17.12s/it] {'loss': 0.2348, 'learning_rate': 5.029785036577976e-06, 'epoch': 2.31} + 78%|███████▊ | 240/309 [1:10:38<19:41, 17.12s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:52:06,671] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.89 | bwd_microstep: 294.35 | bwd_inner_microstep: 294.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:52:07,160] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 312.07 | bwd_inner_microstep: 312.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:52:07,648] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.11 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:52:07,779] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.83 | bwd_microstep: 86.38 | bwd_inner_microstep: 86.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:52:08,266] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 311.82 | bwd_inner_microstep: 311.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:52:08,753] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 311.35 | bwd_inner_microstep: 311.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:52:09,240] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 309.86 | bwd_inner_microstep: 309.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:52:09,729] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.90 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:52:10,217] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:52:10,706] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 311.33 | bwd_inner_microstep: 311.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:52:11,194] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.13 | bwd_microstep: 310.38 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:52:11,325] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.97 | bwd_microstep: 85.79 | bwd_inner_microstep: 85.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:52:11,811] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 311.19 | bwd_inner_microstep: 311.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:52:12,299] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 311.53 | bwd_inner_microstep: 311.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:52:12,786] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 309.95 | bwd_inner_microstep: 309.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:52:13,274] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:52:13,763] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.31 | bwd_microstep: 310.62 | bwd_inner_microstep: 310.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:52:14,250] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 309.84 | bwd_inner_microstep: 309.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:52:14,739] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.88 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 878 +[2025-04-26 22:52:14,991] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.54 | bwd_microstep: 162.08 | bwd_inner_microstep: 162.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:52:15,478] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:52:15,965] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 309.78 | bwd_inner_microstep: 309.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1927 +[2025-04-26 22:52:16,471] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.60 | bwd_microstep: 323.49 | bwd_inner_microstep: 323.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 388 +[2025-04-26 22:52:16,612] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.82 | bwd_microstep: 93.60 | bwd_inner_microstep: 93.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1923 +[2025-04-26 22:52:17,117] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.53 | bwd_microstep: 323.77 | bwd_inner_microstep: 323.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:52:17,606] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 311.79 | bwd_inner_microstep: 311.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:52:18,096] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 312.03 | bwd_inner_microstep: 312.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:52:18,585] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.83 | bwd_microstep: 311.52 | bwd_inner_microstep: 311.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:52:19,074] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 311.22 | bwd_inner_microstep: 311.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 22:52:19,581] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.89 | bwd_microstep: 322.98 | bwd_inner_microstep: 322.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:52:20,070] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:52:23,699] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.06 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 22:52:23,699] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.13 | bwd_microstep: 2181.02 | bwd_inner_microstep: 340.12 | bwd_allreduce_microstep: 1840.86 | step_microstep: 1270.74 +[2025-04-26 22:52:23,701] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5044.34 | bwd: 11024.87 | bwd_inner: 9183.54 | bwd_allreduce: 1840.98 | step: 1271.88 + 78%|███████▊ | 241/309 [1:10:56<19:31, 17.23s/it] {'loss': 0.1883, 'learning_rate': 4.891265346464416e-06, 'epoch': 2.32} + 78%|███████▊ | 241/309 [1:10:56<19:31, 17.23s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:52:24,171] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 294.22 | bwd_inner_microstep: 294.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:52:24,660] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 311.95 | bwd_inner_microstep: 311.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:52:25,148] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:52:25,636] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:52:26,124] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:52:26,612] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:52:27,100] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:52:27,588] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.58 | bwd_inner_microstep: 310.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:52:28,077] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.92 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:52:28,564] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.06 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:52:29,052] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 310.27 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:52:29,539] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 310.03 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:52:30,028] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 311.34 | bwd_inner_microstep: 311.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:52:30,516] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:52:31,004] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:52:31,493] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.68 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:52:31,980] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 309.89 | bwd_inner_microstep: 309.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 367 +[2025-04-26 22:52:32,111] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.48 | bwd_microstep: 86.08 | bwd_inner_microstep: 86.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:52:32,597] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 310.38 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:52:33,084] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 355 +[2025-04-26 22:52:33,209] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.66 | bwd_microstep: 83.66 | bwd_inner_microstep: 83.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1964 +[2025-04-26 22:52:33,717] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.95 | bwd_microstep: 326.72 | bwd_inner_microstep: 326.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2023 +[2025-04-26 22:52:34,236] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.29 | bwd_microstep: 333.59 | bwd_inner_microstep: 333.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:52:34,724] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1927 +[2025-04-26 22:52:35,231] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.38 | bwd_microstep: 323.45 | bwd_inner_microstep: 323.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:52:35,720] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.07 | bwd_microstep: 311.30 | bwd_inner_microstep: 311.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:52:36,210] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.26 | bwd_microstep: 311.57 | bwd_inner_microstep: 311.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:52:36,701] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.37 | bwd_microstep: 311.82 | bwd_inner_microstep: 311.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:52:37,189] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 311.55 | bwd_inner_microstep: 311.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:52:37,678] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 311.26 | bwd_inner_microstep: 311.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:52:38,167] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:52:41,452] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1205.10 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 22:52:41,453] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 1835.07 | bwd_inner_microstep: 340.48 | bwd_allreduce_microstep: 1494.56 | step_microstep: 1272.92 +[2025-04-26 22:52:41,454] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5267.06 | bwd: 11050.48 | bwd_inner: 9555.46 | bwd_allreduce: 1494.67 | step: 1274.03 + 78%|███████▊ | 242/309 [1:11:14<19:25, 17.39s/it] {'loss': 0.2551, 'learning_rate': 4.7544136003581365e-06, 'epoch': 2.33} + 78%|███████▊ | 242/309 [1:11:14<19:25, 17.39s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:52:41,925] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 295.09 | bwd_inner_microstep: 295.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:52:42,414] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 310.73 | bwd_inner_microstep: 310.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:52:42,901] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 310.73 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:52:43,390] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.82 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:52:43,876] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 310.03 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:52:44,364] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:52:44,853] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.00 | bwd_microstep: 310.86 | bwd_inner_microstep: 310.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:52:45,341] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 310.16 | bwd_inner_microstep: 310.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:52:45,828] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:52:46,315] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:52:46,804] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 311.10 | bwd_inner_microstep: 311.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:52:47,291] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 309.72 | bwd_inner_microstep: 309.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:52:47,779] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:52:48,268] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:52:48,756] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 311.01 | bwd_inner_microstep: 310.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:52:49,244] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:52:49,732] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:52:50,219] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 309.84 | bwd_inner_microstep: 309.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:52:50,707] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:52:51,194] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:52:51,682] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 309.80 | bwd_inner_microstep: 309.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:52:52,169] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 309.47 | bwd_inner_microstep: 309.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:52:52,658] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 311.97 | bwd_inner_microstep: 311.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:52:53,148] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 311.63 | bwd_inner_microstep: 311.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:52:53,637] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 22:52:54,142] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.04 | bwd_microstep: 322.63 | bwd_inner_microstep: 322.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:52:54,631] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.39 | bwd_microstep: 311.03 | bwd_inner_microstep: 311.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:52:55,120] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 311.03 | bwd_inner_microstep: 311.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:52:55,252] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.36 | bwd_microstep: 87.06 | bwd_inner_microstep: 87.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:52:55,739] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 311.81 | bwd_inner_microstep: 311.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:52:56,228] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 311.56 | bwd_inner_microstep: 311.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:52:59,258] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.65 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 22:52:59,259] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.03 | bwd_microstep: 1715.74 | bwd_inner_microstep: 117.06 | bwd_allreduce_microstep: 1598.64 | step_microstep: 1270.33 +[2025-04-26 22:52:59,260] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5254.65 | bwd: 11118.93 | bwd_inner: 9519.83 | bwd_allreduce: 1598.76 | step: 1271.45 + 79%|███████▊ | 243/309 [1:11:31<19:15, 17.51s/it] {'loss': 0.2097, 'learning_rate': 4.619244906145734e-06, 'epoch': 2.34} + 79%|███████▊ | 243/309 [1:11:31<19:15, 17.51s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:52:59,731] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 294.71 | bwd_inner_microstep: 294.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:53:00,219] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:00,706] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:53:00,838] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.87 | bwd_microstep: 86.83 | bwd_inner_microstep: 86.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:53:01,324] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:53:01,812] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 311.08 | bwd_inner_microstep: 311.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:02,300] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:02,787] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 310.03 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:03,274] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.81 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 22:53:03,404] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.92 | bwd_microstep: 86.38 | bwd_inner_microstep: 86.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:03,891] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 311.27 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:53:04,378] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:04,866] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 310.61 | bwd_inner_microstep: 310.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:05,353] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:53:05,840] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:53:06,327] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 309.79 | bwd_inner_microstep: 309.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:53:06,815] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 309.96 | bwd_inner_microstep: 309.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:53:07,302] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:53:07,789] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.88 | bwd_microstep: 309.90 | bwd_inner_microstep: 309.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:53:08,275] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 309.30 | bwd_inner_microstep: 309.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:53:08,762] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.76 | bwd_microstep: 309.95 | bwd_inner_microstep: 309.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:53:09,248] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 308.93 | bwd_inner_microstep: 308.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1929 +[2025-04-26 22:53:09,754] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.64 | bwd_microstep: 323.49 | bwd_inner_microstep: 323.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:53:10,244] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.87 | bwd_microstep: 311.52 | bwd_inner_microstep: 311.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 22:53:10,749] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.07 | bwd_microstep: 322.45 | bwd_inner_microstep: 322.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:53:11,237] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.85 | bwd_microstep: 311.18 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:53:11,727] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.32 | bwd_microstep: 310.91 | bwd_inner_microstep: 310.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:53:12,216] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 311.53 | bwd_inner_microstep: 311.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:53:12,706] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.06 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:53:13,195] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 311.57 | bwd_inner_microstep: 311.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:53:13,685] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.97 | bwd_microstep: 311.75 | bwd_inner_microstep: 311.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:53:16,593] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.71 | optimizer_gradients: 17.53 | optimizer_step: 32.04 +[2025-04-26 22:53:16,593] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 1459.87 | bwd_inner_microstep: 340.37 | bwd_allreduce_microstep: 1119.45 | step_microstep: 1271.48 +[2025-04-26 22:53:16,595] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5258.90 | bwd: 10648.98 | bwd_inner: 9529.06 | bwd_allreduce: 1119.58 | step: 1272.56 + 79%|███████▉ | 244/309 [1:11:49<18:54, 17.46s/it] {'loss': 0.2101, 'learning_rate': 4.4857741859116024e-06, 'epoch': 2.35} + 79%|███████▉ | 244/309 [1:11:49<18:54, 17.46s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:53:17,067] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 294.95 | bwd_inner_microstep: 294.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 886 +[2025-04-26 22:53:17,320] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.12 | bwd_microstep: 162.35 | bwd_inner_microstep: 162.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:17,806] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.92 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:53:18,295] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.19 | bwd_microstep: 312.90 | bwd_inner_microstep: 312.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:18,783] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1908 +[2025-04-26 22:53:19,269] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:53:19,758] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.33 | bwd_microstep: 311.21 | bwd_inner_microstep: 311.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:53:20,246] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:53:20,734] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:53:20,865] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.42 | bwd_microstep: 86.89 | bwd_inner_microstep: 86.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:53:21,351] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 310.84 | bwd_inner_microstep: 310.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:21,839] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 311.39 | bwd_inner_microstep: 311.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:53:22,326] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 309.71 | bwd_inner_microstep: 309.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:22,813] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 886 +[2025-04-26 22:53:23,067] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.51 | bwd_microstep: 162.64 | bwd_inner_microstep: 162.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:53:23,553] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 311.47 | bwd_inner_microstep: 311.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:24,041] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 311.03 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:24,529] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 311.09 | bwd_inner_microstep: 311.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:53:25,018] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:53:25,507] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 311.70 | bwd_inner_microstep: 311.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:53:25,995] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:53:26,480] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 309.10 | bwd_inner_microstep: 309.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1964 +[2025-04-26 22:53:26,990] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.14 | bwd_microstep: 326.32 | bwd_inner_microstep: 326.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 22:53:27,496] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.65 | bwd_microstep: 322.82 | bwd_inner_microstep: 322.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:53:27,985] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 311.20 | bwd_inner_microstep: 311.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:53:28,473] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:53:28,962] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.32 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:53:29,451] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 311.30 | bwd_inner_microstep: 311.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:53:29,940] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 311.82 | bwd_inner_microstep: 311.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:53:30,428] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 885 +[2025-04-26 22:53:30,681] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.60 | bwd_microstep: 162.92 | bwd_inner_microstep: 162.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:53:33,483] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.61 | optimizer_gradients: 17.55 | optimizer_step: 32.04 +[2025-04-26 22:53:33,483] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 1356.74 | bwd_inner_microstep: 338.96 | bwd_allreduce_microstep: 1017.74 | step_microstep: 1270.40 +[2025-04-26 22:53:33,485] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5130.81 | bwd: 10336.99 | bwd_inner: 9318.78 | bwd_allreduce: 1017.85 | step: 1271.46 + 79%|███████▉ | 245/309 [1:12:06<18:26, 17.29s/it] {'loss': 0.2218, 'learning_rate': 4.354016174290572e-06, 'epoch': 2.36} + 79%|███████▉ | 245/309 [1:12:06<18:26, 17.29s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:33,955] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 294.11 | bwd_inner_microstep: 294.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 22:53:34,086] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.29 | bwd_microstep: 87.10 | bwd_inner_microstep: 87.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:34,574] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 313.23 | bwd_inner_microstep: 313.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:53:35,062] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 310.86 | bwd_inner_microstep: 310.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:53:35,550] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 311.73 | bwd_inner_microstep: 311.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:36,038] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.49 | bwd_microstep: 310.27 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:53:36,526] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.80 | bwd_microstep: 310.86 | bwd_inner_microstep: 310.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:37,016] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.24 | bwd_microstep: 311.16 | bwd_inner_microstep: 311.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:37,503] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 310.33 | bwd_inner_microstep: 310.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:37,991] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:38,479] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.82 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:38,966] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:53:39,454] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.62 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:39,942] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:53:40,430] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:40,918] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.73 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:53:41,405] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.49 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:53:41,893] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.97 | bwd_microstep: 310.73 | bwd_inner_microstep: 310.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:53:42,380] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 309.92 | bwd_inner_microstep: 309.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:53:42,868] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 310.98 | bwd_inner_microstep: 310.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 358 +[2025-04-26 22:53:42,997] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.89 | bwd_microstep: 85.12 | bwd_inner_microstep: 85.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1977 +[2025-04-26 22:53:43,507] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.30 | bwd_microstep: 327.79 | bwd_inner_microstep: 327.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2010 +[2025-04-26 22:53:44,022] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.26 | bwd_microstep: 331.40 | bwd_inner_microstep: 331.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 926 +[2025-04-26 22:53:44,291] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 91.82 | bwd_microstep: 173.38 | bwd_inner_microstep: 173.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 915 +[2025-04-26 22:53:44,559] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 91.30 | bwd_microstep: 172.71 | bwd_inner_microstep: 172.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1923 +[2025-04-26 22:53:45,064] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.17 | bwd_microstep: 323.50 | bwd_inner_microstep: 323.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:53:45,553] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 311.87 | bwd_inner_microstep: 311.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:53:46,043] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.07 | bwd_microstep: 311.01 | bwd_inner_microstep: 310.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:53:46,175] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.57 | bwd_microstep: 87.06 | bwd_inner_microstep: 87.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:53:46,663] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 312.98 | bwd_inner_microstep: 312.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:53:47,152] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 312.62 | bwd_inner_microstep: 312.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:53:50,351] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.84 | optimizer_gradients: 17.51 | optimizer_step: 32.01 +[2025-04-26 22:53:50,352] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 1749.42 | bwd_inner_microstep: 338.50 | bwd_allreduce_microstep: 1410.89 | step_microstep: 1272.61 +[2025-04-26 22:53:50,353] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4976.82 | bwd: 10474.24 | bwd_inner: 9062.89 | bwd_allreduce: 1411.01 | step: 1273.65 + 80%|███████▉ | 246/309 [1:12:23<18:01, 17.16s/it] {'loss': 0.3078, 'learning_rate': 4.223985416841292e-06, 'epoch': 2.37} + 80%|███████▉ | 246/309 [1:12:23<18:01, 17.16s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:50,824] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 295.10 | bwd_inner_microstep: 295.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:51,311] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:53:51,798] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.80 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:53:52,288] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.19 | bwd_microstep: 311.15 | bwd_inner_microstep: 311.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:52,775] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 310.42 | bwd_inner_microstep: 310.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:53:53,263] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:53:53,751] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:54,239] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 310.43 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:54,726] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.53 | bwd_inner_microstep: 310.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:55,214] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:55,704] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.37 | bwd_microstep: 311.60 | bwd_inner_microstep: 311.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:56,192] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.95 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:53:56,680] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:53:57,170] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.48 | bwd_microstep: 311.32 | bwd_inner_microstep: 311.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:53:57,658] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:53:58,145] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:53:58,634] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.56 | bwd_microstep: 310.93 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:53:59,122] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 310.74 | bwd_inner_microstep: 310.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:53:59,609] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:54:00,097] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:54:00,585] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 311.37 | bwd_inner_microstep: 311.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:54:01,072] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 309.49 | bwd_inner_microstep: 309.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1964 +[2025-04-26 22:54:01,583] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.72 | bwd_microstep: 326.77 | bwd_inner_microstep: 326.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:54:02,073] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.76 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:54:02,562] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.51 | bwd_microstep: 311.16 | bwd_inner_microstep: 311.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1908 +[2025-04-26 22:54:03,051] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.37 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:54:03,539] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 311.27 | bwd_inner_microstep: 311.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:54:04,029] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.20 | bwd_microstep: 312.30 | bwd_inner_microstep: 312.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:54:04,517] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 311.05 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:54:05,007] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.27 | bwd_microstep: 311.87 | bwd_inner_microstep: 311.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 377 +[2025-04-26 22:54:05,139] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.10 | bwd_microstep: 87.15 | bwd_inner_microstep: 87.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:54:08,316] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.57 | optimizer_gradients: 17.51 | optimizer_step: 32.03 +[2025-04-26 22:54:08,317] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 1731.27 | bwd_inner_microstep: 340.91 | bwd_allreduce_microstep: 1390.32 | step_microstep: 1270.33 +[2025-04-26 22:54:08,318] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5394.88 | bwd: 11143.19 | bwd_inner: 9752.41 | bwd_allreduce: 1390.44 | step: 1271.41 + 80%|███████▉ | 247/309 [1:12:40<17:59, 17.40s/it] {'loss': 0.2115, 'learning_rate': 4.095696268440426e-06, 'epoch': 2.38} + 80%|███████▉ | 247/309 [1:12:40<17:59, 17.40s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:54:08,789] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 294.36 | bwd_inner_microstep: 294.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:54:09,278] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 312.50 | bwd_inner_microstep: 312.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:54:09,408] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.24 | bwd_microstep: 86.44 | bwd_inner_microstep: 86.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:54:09,538] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.13 | bwd_microstep: 86.65 | bwd_inner_microstep: 86.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:54:10,024] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:10,513] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 311.98 | bwd_inner_microstep: 311.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:54:11,000] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:54:11,489] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:54:11,977] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.27 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:54:12,108] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.80 | bwd_microstep: 86.48 | bwd_inner_microstep: 86.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:54:12,594] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:54:13,083] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 311.50 | bwd_inner_microstep: 311.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:13,571] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 310.43 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:14,059] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:54:14,548] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 311.21 | bwd_inner_microstep: 311.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:54:15,037] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:54:15,525] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:54:16,012] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:54:16,500] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 310.56 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 878 +[2025-04-26 22:54:16,753] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.38 | bwd_microstep: 162.02 | bwd_inner_microstep: 162.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:54:17,240] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:54:17,726] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 309.35 | bwd_inner_microstep: 309.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:54:18,231] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.06 | bwd_microstep: 323.32 | bwd_inner_microstep: 323.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:54:18,738] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.59 | bwd_microstep: 323.48 | bwd_inner_microstep: 323.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:54:19,227] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 310.85 | bwd_inner_microstep: 310.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:19,715] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 311.72 | bwd_inner_microstep: 311.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:54:20,203] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 311.09 | bwd_inner_microstep: 311.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:54:20,694] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.68 | bwd_microstep: 311.97 | bwd_inner_microstep: 311.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:54:21,184] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.77 | bwd_microstep: 311.28 | bwd_inner_microstep: 311.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:54:21,671] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 311.24 | bwd_inner_microstep: 311.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:54:22,161] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.45 | bwd_microstep: 311.28 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:54:25,464] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1205.29 | optimizer_gradients: 17.51 | optimizer_step: 32.03 +[2025-04-26 22:54:25,464] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 1853.00 | bwd_inner_microstep: 340.35 | bwd_allreduce_microstep: 1512.61 | step_microstep: 1272.97 +[2025-04-26 22:54:25,466] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5037.72 | bwd: 10678.55 | bwd_inner: 9165.47 | bwd_allreduce: 1512.73 | step: 1274.11 + 80%|████████ | 248/309 [1:12:58<17:36, 17.33s/it] {'loss': 0.1939, 'learning_rate': 3.969162891697962e-06, 'epoch': 2.39} + 80%|████████ | 248/309 [1:12:58<17:36, 17.33s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:54:25,937] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 294.81 | bwd_inner_microstep: 294.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:54:26,425] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.54 | bwd_microstep: 311.91 | bwd_inner_microstep: 311.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:54:26,911] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.61 | bwd_microstep: 310.16 | bwd_inner_microstep: 310.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:54:27,398] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:54:27,888] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.71 | bwd_microstep: 310.99 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:54:28,375] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:54:28,862] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 310.49 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:54:29,350] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.90 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:29,837] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 309.98 | bwd_inner_microstep: 309.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:30,325] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.00 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:30,812] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.77 | bwd_microstep: 310.21 | bwd_inner_microstep: 310.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:54:31,301] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.60 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 881 +[2025-04-26 22:54:31,553] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.91 | bwd_microstep: 162.55 | bwd_inner_microstep: 162.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:54:32,042] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 312.81 | bwd_inner_microstep: 312.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:32,530] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 311.38 | bwd_inner_microstep: 311.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:33,018] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.77 | bwd_microstep: 311.07 | bwd_inner_microstep: 311.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:33,506] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:54:33,993] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.06 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:54:34,481] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 310.18 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1895 +[2025-04-26 22:54:34,968] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.08 | bwd_microstep: 309.47 | bwd_inner_microstep: 309.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:54:35,455] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 309.34 | bwd_inner_microstep: 309.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:54:35,942] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 309.62 | bwd_inner_microstep: 309.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 22:54:36,447] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.04 | bwd_microstep: 322.86 | bwd_inner_microstep: 322.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:54:36,936] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.98 | bwd_inner_microstep: 310.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:54:37,425] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.34 | bwd_microstep: 311.73 | bwd_inner_microstep: 311.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:54:37,913] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.80 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:54:38,402] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.46 | bwd_microstep: 310.84 | bwd_inner_microstep: 310.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:54:38,893] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.63 | bwd_microstep: 312.01 | bwd_inner_microstep: 312.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:54:39,381] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 310.97 | bwd_inner_microstep: 310.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:54:39,869] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 311.22 | bwd_inner_microstep: 311.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:40,359] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.91 | bwd_microstep: 311.11 | bwd_inner_microstep: 311.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:54:43,073] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.34 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 22:54:43,074] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.85 | bwd_microstep: 1267.29 | bwd_inner_microstep: 341.28 | bwd_allreduce_microstep: 925.97 | step_microstep: 1270.26 +[2025-04-26 22:54:43,075] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5440.87 | bwd: 10747.64 | bwd_inner: 9821.21 | bwd_allreduce: 926.09 | step: 1271.41 + 81%|████████ | 249/309 [1:13:15<17:24, 17.41s/it] {'loss': 0.1181, 'learning_rate': 3.844399255393705e-06, 'epoch': 2.4} + 81%|████████ | 249/309 [1:13:15<17:24, 17.41s/it]dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 377 +[2025-04-26 22:54:43,188] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 36.58 | bwd_microstep: 69.94 | bwd_inner_microstep: 69.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:54:43,675] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:54:44,163] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 312.51 | bwd_inner_microstep: 312.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:54:44,653] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.22 | bwd_microstep: 310.78 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:54:45,141] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 311.26 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:54:45,628] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:46,116] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 310.75 | bwd_inner_microstep: 310.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:46,603] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:47,090] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:47,578] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:48,066] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.76 | bwd_microstep: 310.69 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:54:48,555] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 310.97 | bwd_inner_microstep: 310.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:54:49,043] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 310.43 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1399 +[2025-04-26 22:54:49,416] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.70 | bwd_microstep: 239.61 | bwd_inner_microstep: 239.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1400 +[2025-04-26 22:54:49,789] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 127.88 | bwd_microstep: 240.71 | bwd_inner_microstep: 240.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:54:50,276] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:54:50,763] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 311.00 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:54:51,251] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:54:51,737] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 309.95 | bwd_inner_microstep: 309.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:54:52,224] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 309.33 | bwd_inner_microstep: 309.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:54:52,710] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 308.88 | bwd_inner_microstep: 308.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:54:53,198] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 22:54:53,706] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.54 | bwd_microstep: 324.36 | bwd_inner_microstep: 324.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:54:54,195] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.35 | bwd_microstep: 311.66 | bwd_inner_microstep: 311.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:54:54,701] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.28 | bwd_microstep: 323.21 | bwd_inner_microstep: 323.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:54:55,191] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.20 | bwd_microstep: 311.69 | bwd_inner_microstep: 311.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:54:55,679] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.59 | bwd_microstep: 311.24 | bwd_inner_microstep: 311.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:54:56,169] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.21 | bwd_microstep: 311.55 | bwd_inner_microstep: 311.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:54:56,660] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.66 | bwd_microstep: 312.37 | bwd_inner_microstep: 312.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:54:57,148] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.90 | bwd_microstep: 311.28 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:54:57,637] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.87 | bwd_microstep: 311.02 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1400 +[2025-04-26 22:55:00,533] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.57 | optimizer_gradients: 17.52 | optimizer_step: 32.05 +[2025-04-26 22:55:00,534] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.44 | bwd_microstep: 1490.14 | bwd_inner_microstep: 268.51 | bwd_allreduce_microstep: 1221.60 | step_microstep: 1272.34 +[2025-04-26 22:55:00,535] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5263.30 | bwd: 10768.05 | bwd_inner: 9545.98 | bwd_allreduce: 1221.72 | step: 1273.38 + 81%|████████ | 250/309 [1:13:33<17:08, 17.43s/it] {'loss': 0.2248, 'learning_rate': 3.7214191329351735e-06, 'epoch': 2.41} + 81%|████████ | 250/309 [1:13:33<17:08, 17.43s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:01,006] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 295.07 | bwd_inner_microstep: 295.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:55:01,495] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:01,982] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 310.83 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:02,471] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.13 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:55:02,959] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.59 | bwd_microstep: 310.94 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 885 +[2025-04-26 22:55:03,212] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.10 | bwd_microstep: 162.84 | bwd_inner_microstep: 162.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:55:03,699] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 311.43 | bwd_inner_microstep: 311.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:55:04,187] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 311.53 | bwd_inner_microstep: 311.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:55:04,675] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.85 | bwd_microstep: 310.19 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:05,162] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.95 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:55:05,650] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.23 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:55:06,139] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 311.50 | bwd_inner_microstep: 311.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:06,630] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:07,118] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.50 | bwd_microstep: 310.78 | bwd_inner_microstep: 310.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:07,607] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 310.94 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:08,094] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 310.40 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:08,584] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.11 | bwd_microstep: 311.36 | bwd_inner_microstep: 311.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:55:09,071] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 309.72 | bwd_inner_microstep: 309.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:55:09,559] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 310.40 | bwd_inner_microstep: 310.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:55:10,047] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 310.69 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:55:10,534] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.64 | bwd_microstep: 309.40 | bwd_inner_microstep: 309.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:55:11,020] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 309.37 | bwd_inner_microstep: 309.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:55:11,510] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:55:11,998] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.99 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:55:12,487] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 311.84 | bwd_inner_microstep: 311.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:55:12,977] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 311.64 | bwd_inner_microstep: 311.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:55:13,466] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 311.25 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:13,954] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 310.83 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:55:14,444] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.29 | bwd_microstep: 311.67 | bwd_inner_microstep: 311.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:55:14,933] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 311.58 | bwd_inner_microstep: 311.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:55:15,421] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 311.23 | bwd_inner_microstep: 311.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:55:17,880] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.85 | optimizer_gradients: 17.54 | optimizer_step: 32.03 +[2025-04-26 22:55:17,881] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.99 | bwd_microstep: 1012.25 | bwd_inner_microstep: 622.73 | bwd_allreduce_microstep: 389.49 | step_microstep: 1269.72 +[2025-04-26 22:55:17,882] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5427.04 | bwd: 10485.62 | bwd_inner: 10095.67 | bwd_allreduce: 389.61 | step: 1270.88 + 81%|████████ | 251/309 [1:13:50<16:49, 17.40s/it] {'loss': 0.1661, 'learning_rate': 3.6002361008370802e-06, 'epoch': 2.42} + 81%|████████ | 251/309 [1:13:50<16:49, 17.40s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:18,355] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 295.29 | bwd_inner_microstep: 295.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:55:18,843] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.95 | bwd_inner_microstep: 310.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:55:18,974] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.81 | bwd_microstep: 86.94 | bwd_inner_microstep: 86.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:55:19,461] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 312.63 | bwd_inner_microstep: 312.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:19,950] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1907 +[2025-04-26 22:55:20,437] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:20,926] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.81 | bwd_microstep: 311.27 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:21,415] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:55:21,904] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.17 | bwd_microstep: 311.92 | bwd_inner_microstep: 311.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:22,392] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 310.69 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:55:22,880] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 310.62 | bwd_inner_microstep: 310.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:55:23,368] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 310.60 | bwd_inner_microstep: 310.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:23,856] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:24,344] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 310.21 | bwd_inner_microstep: 310.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:55:24,833] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 310.61 | bwd_inner_microstep: 310.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:55:25,321] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.15 | bwd_microstep: 309.99 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:55:25,809] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:55:26,298] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.29 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:55:26,428] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.78 | bwd_microstep: 85.89 | bwd_inner_microstep: 85.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 869 +[2025-04-26 22:55:26,677] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 83.36 | bwd_microstep: 161.18 | bwd_inner_microstep: 161.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:55:27,163] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 355 +[2025-04-26 22:55:27,289] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.72 | bwd_microstep: 83.49 | bwd_inner_microstep: 83.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 22:55:27,796] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.77 | bwd_microstep: 324.84 | bwd_inner_microstep: 324.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1939 +[2025-04-26 22:55:28,303] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.65 | bwd_microstep: 324.66 | bwd_inner_microstep: 324.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:55:28,791] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 310.98 | bwd_inner_microstep: 310.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:55:29,280] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.55 | bwd_microstep: 310.86 | bwd_inner_microstep: 310.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 22:55:29,412] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.11 | bwd_microstep: 87.23 | bwd_inner_microstep: 87.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 22:55:29,918] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.38 | bwd_microstep: 323.71 | bwd_inner_microstep: 323.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:55:30,407] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 312.07 | bwd_inner_microstep: 312.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:30,896] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:55:31,385] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.97 | bwd_microstep: 311.56 | bwd_inner_microstep: 311.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:55:35,859] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.39 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 22:55:35,860] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 3024.65 | bwd_inner_microstep: 338.24 | bwd_allreduce_microstep: 2686.38 | step_microstep: 1272.10 +[2025-04-26 22:55:35,861] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4908.60 | bwd: 11638.46 | bwd_inner: 8951.61 | bwd_allreduce: 2686.50 | step: 1273.24 + 82%|████████▏ | 252/309 [1:14:08<16:41, 17.58s/it] {'loss': 0.1613, 'learning_rate': 3.4808635372225276e-06, 'epoch': 2.43} + 82%|████████▏ | 252/309 [1:14:08<16:41, 17.58s/it]dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 887 +[2025-04-26 22:55:36,096] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 83.06 | bwd_microstep: 145.99 | bwd_inner_microstep: 145.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:55:36,583] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 310.16 | bwd_inner_microstep: 310.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:37,070] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 310.14 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:37,559] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:55:38,047] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:38,534] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:39,022] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.87 | bwd_microstep: 310.26 | bwd_inner_microstep: 310.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:55:39,509] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 309.62 | bwd_inner_microstep: 309.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:55:39,994] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 309.69 | bwd_inner_microstep: 309.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:55:40,482] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:40,969] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:41,459] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.73 | bwd_microstep: 311.47 | bwd_inner_microstep: 311.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:55:41,946] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:55:42,434] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:42,923] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.44 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:43,411] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 310.26 | bwd_inner_microstep: 310.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:55:43,899] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 309.89 | bwd_inner_microstep: 309.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:55:44,029] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.17 | bwd_microstep: 85.91 | bwd_inner_microstep: 85.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:55:44,157] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.71 | bwd_microstep: 85.86 | bwd_inner_microstep: 85.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:55:44,642] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 309.42 | bwd_inner_microstep: 309.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 867 +[2025-04-26 22:55:44,893] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.70 | bwd_microstep: 161.20 | bwd_inner_microstep: 161.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1954 +[2025-04-26 22:55:45,402] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.23 | bwd_microstep: 325.71 | bwd_inner_microstep: 325.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1988 +[2025-04-26 22:55:45,915] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.07 | bwd_microstep: 329.93 | bwd_inner_microstep: 329.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:55:46,404] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 311.23 | bwd_inner_microstep: 311.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 22:55:46,909] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.80 | bwd_microstep: 322.41 | bwd_inner_microstep: 322.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1923 +[2025-04-26 22:55:47,415] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.08 | bwd_microstep: 322.76 | bwd_inner_microstep: 322.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:55:47,904] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.77 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:55:48,393] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:55:48,882] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.16 | bwd_microstep: 311.11 | bwd_inner_microstep: 311.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:55:49,370] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 311.05 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:55:49,860] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.39 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:55:53,575] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.14 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 22:55:53,576] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 2269.76 | bwd_inner_microstep: 339.64 | bwd_allreduce_microstep: 1930.09 | step_microstep: 1268.87 +[2025-04-26 22:55:53,577] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5093.75 | bwd: 11190.80 | bwd_inner: 9260.25 | bwd_allreduce: 1930.21 | step: 1270.02 + 82%|████████▏ | 253/309 [1:14:26<16:26, 17.62s/it] {'loss': 0.226, 'learning_rate': 3.3633146203461275e-06, 'epoch': 2.44} + 82%|████████▏ | 253/309 [1:14:26<16:26, 17.62s/it]dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1400 +[2025-04-26 22:55:53,932] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 125.82 | bwd_microstep: 223.32 | bwd_inner_microstep: 223.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:55:54,418] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:55:54,906] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:55:55,393] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 310.60 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:55:55,881] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 310.12 | bwd_inner_microstep: 310.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 22:55:56,012] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.19 | bwd_microstep: 86.13 | bwd_inner_microstep: 86.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:55:56,498] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:55:56,986] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:55:57,473] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 310.05 | bwd_inner_microstep: 310.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:57,962] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 311.13 | bwd_inner_microstep: 311.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:55:58,450] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:55:58,937] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 309.93 | bwd_inner_microstep: 309.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:59,425] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.31 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:55:59,914] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.00 | bwd_microstep: 309.99 | bwd_inner_microstep: 309.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:56:00,401] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 310.60 | bwd_inner_microstep: 310.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:00,891] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.59 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:56:01,379] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 880 +[2025-04-26 22:56:01,632] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.00 | bwd_microstep: 162.39 | bwd_inner_microstep: 162.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 4, images per sample: 4.0, dynamic token length: 1135 +[2025-04-26 22:56:01,933] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 104.26 | bwd_microstep: 191.79 | bwd_inner_microstep: 191.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:56:02,420] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:56:02,906] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 310.12 | bwd_inner_microstep: 310.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:56:03,393] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 308.98 | bwd_inner_microstep: 308.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1964 +[2025-04-26 22:56:03,902] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.27 | bwd_microstep: 326.18 | bwd_inner_microstep: 326.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1410 +[2025-04-26 22:56:04,281] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 130.27 | bwd_microstep: 243.07 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:56:04,768] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 311.30 | bwd_inner_microstep: 311.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:56:05,257] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 311.52 | bwd_inner_microstep: 311.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:56:05,746] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 311.60 | bwd_inner_microstep: 311.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:56:06,234] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:06,722] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 310.84 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:56:07,211] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.95 | bwd_microstep: 311.05 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:07,699] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:56:11,090] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1205.16 | optimizer_gradients: 17.56 | optimizer_step: 32.03 +[2025-04-26 22:56:11,091] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 1940.88 | bwd_inner_microstep: 339.97 | bwd_allreduce_microstep: 1600.87 | step_microstep: 1273.11 +[2025-04-26 22:56:11,092] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5139.84 | bwd: 10937.78 | bwd_inner: 9336.44 | bwd_allreduce: 1600.99 | step: 1274.30 + 82%|████████▏ | 254/309 [1:14:43<16:07, 17.59s/it] {'loss': 0.1599, 'learning_rate': 3.2476023271391698e-06, 'epoch': 2.45} + 82%|████████▏ | 254/309 [1:14:43<16:07, 17.59s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:56:11,562] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 294.63 | bwd_inner_microstep: 294.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:12,049] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:56:12,535] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:13,022] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.73 | bwd_microstep: 309.83 | bwd_inner_microstep: 309.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:13,510] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 309.94 | bwd_inner_microstep: 309.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:56:13,996] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 309.79 | bwd_inner_microstep: 309.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:14,483] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:56:14,969] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 309.14 | bwd_inner_microstep: 309.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:15,457] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:56:15,944] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 309.54 | bwd_inner_microstep: 309.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:56:16,431] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 310.83 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:56:16,918] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:56:17,405] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 309.98 | bwd_inner_microstep: 309.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:56:17,893] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:18,379] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:56:18,866] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 309.79 | bwd_inner_microstep: 309.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:56:19,354] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 310.04 | bwd_inner_microstep: 310.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:56:19,841] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:56:20,328] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 309.67 | bwd_inner_microstep: 309.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:56:20,816] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:56:21,302] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 308.97 | bwd_inner_microstep: 308.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:56:21,787] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 309.04 | bwd_inner_microstep: 309.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 387 +[2025-04-26 22:56:21,927] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.36 | bwd_microstep: 93.95 | bwd_inner_microstep: 93.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:56:22,414] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 311.57 | bwd_inner_microstep: 311.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:56:22,902] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 311.36 | bwd_inner_microstep: 311.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:56:23,391] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 310.94 | bwd_inner_microstep: 310.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:56:23,879] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 311.10 | bwd_inner_microstep: 311.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:56:24,367] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.35 | bwd_microstep: 310.60 | bwd_inner_microstep: 310.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:56:24,855] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:56:25,344] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.29 | bwd_microstep: 310.93 | bwd_inner_microstep: 310.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:56:25,832] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 311.03 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:56:28,294] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.62 | optimizer_gradients: 17.50 | optimizer_step: 32.04 +[2025-04-26 22:56:28,294] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 1013.97 | bwd_inner_microstep: 625.15 | bwd_allreduce_microstep: 388.78 | step_microstep: 1271.39 +[2025-04-26 22:56:28,296] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5381.56 | bwd: 10400.13 | bwd_inner: 10010.89 | bwd_allreduce: 388.90 | step: 1272.45 + 83%|████████▎ | 255/309 [1:15:00<15:43, 17.47s/it] {'loss': 0.1809, 'learning_rate': 3.1337394317770208e-06, 'epoch': 2.46} + 83%|████████▎ | 255/309 [1:15:00<15:43, 17.47s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:56:28,768] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 295.53 | bwd_inner_microstep: 295.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:56:29,256] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 309.97 | bwd_inner_microstep: 309.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:56:29,742] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:56:30,231] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.51 | bwd_microstep: 310.40 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:56:30,718] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:56:31,207] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.07 | bwd_microstep: 310.53 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:56:31,695] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:56:31,826] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.15 | bwd_microstep: 87.20 | bwd_inner_microstep: 87.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:32,312] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 310.75 | bwd_inner_microstep: 310.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:56:32,800] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 311.27 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:56:33,288] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 310.58 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:33,775] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 310.45 | bwd_inner_microstep: 310.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:56:34,264] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.08 | bwd_microstep: 310.38 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:56:34,751] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:56:35,239] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:35,727] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:56:36,213] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 309.85 | bwd_inner_microstep: 309.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:56:36,700] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.31 | bwd_microstep: 309.81 | bwd_inner_microstep: 309.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:56:37,187] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 309.78 | bwd_inner_microstep: 309.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:56:37,675] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 310.05 | bwd_inner_microstep: 310.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:56:38,161] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.87 | bwd_microstep: 308.98 | bwd_inner_microstep: 308.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:56:38,649] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.22 | bwd_microstep: 309.93 | bwd_inner_microstep: 309.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1953 +[2025-04-26 22:56:39,158] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.86 | bwd_microstep: 325.33 | bwd_inner_microstep: 325.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:56:39,647] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 311.80 | bwd_inner_microstep: 311.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 22:56:40,155] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.75 | bwd_microstep: 323.58 | bwd_inner_microstep: 323.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:56:40,644] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.24 | bwd_microstep: 311.05 | bwd_inner_microstep: 311.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:56:41,133] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 311.46 | bwd_inner_microstep: 311.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:56:41,622] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 312.13 | bwd_inner_microstep: 312.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:56:42,110] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 310.53 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:56:42,599] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 311.38 | bwd_inner_microstep: 311.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:56:43,088] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.64 | bwd_microstep: 311.37 | bwd_inner_microstep: 311.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:56:46,019] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.54 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 22:56:46,019] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 1481.78 | bwd_inner_microstep: 340.12 | bwd_allreduce_microstep: 1141.63 | step_microstep: 1272.18 +[2025-04-26 22:56:46,021] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5401.76 | bwd: 10898.06 | bwd_inner: 9755.97 | bwd_allreduce: 1141.75 | step: 1273.22 + 83%|████████▎ | 256/309 [1:15:18<15:30, 17.55s/it] {'loss': 0.1874, 'learning_rate': 3.021738504268905e-06, 'epoch': 2.47} + 83%|████████▎ | 256/309 [1:15:18<15:30, 17.55s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:56:46,490] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 293.63 | bwd_inner_microstep: 293.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:46,978] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 312.12 | bwd_inner_microstep: 312.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:56:47,465] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.87 | bwd_microstep: 309.57 | bwd_inner_microstep: 309.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:56:47,952] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:48,441] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.59 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:56:48,928] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.63 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:56:49,416] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.04 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:49,903] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:56:50,392] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.92 | bwd_microstep: 311.37 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:56:50,523] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.55 | bwd_microstep: 86.76 | bwd_inner_microstep: 86.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:56:51,008] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:56:51,496] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 311.29 | bwd_inner_microstep: 311.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:56:51,983] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 309.94 | bwd_inner_microstep: 309.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:52,471] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:56:52,959] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.26 | bwd_inner_microstep: 310.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:56:53,448] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.26 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:56:53,937] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.22 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:56:54,425] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:56:54,913] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 878 +[2025-04-26 22:56:55,165] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.40 | bwd_microstep: 161.81 | bwd_inner_microstep: 161.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:56:55,294] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.84 | bwd_microstep: 85.66 | bwd_inner_microstep: 85.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:56:55,778] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 308.98 | bwd_inner_microstep: 308.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1940 +[2025-04-26 22:56:56,286] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.96 | bwd_microstep: 324.97 | bwd_inner_microstep: 324.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1927 +[2025-04-26 22:56:56,792] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.91 | bwd_microstep: 323.21 | bwd_inner_microstep: 323.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:56:57,281] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:56:57,412] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.14 | bwd_microstep: 86.69 | bwd_inner_microstep: 86.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:56:57,900] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 311.92 | bwd_inner_microstep: 311.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:56:58,388] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 311.35 | bwd_inner_microstep: 311.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:56:58,877] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 311.20 | bwd_inner_microstep: 311.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:56:59,365] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:56:59,855] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 311.25 | bwd_inner_microstep: 311.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:57:03,224] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.43 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 22:57:03,224] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 1921.75 | bwd_inner_microstep: 340.11 | bwd_allreduce_microstep: 1581.60 | step_microstep: 1270.13 +[2025-04-26 22:57:03,226] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5034.38 | bwd: 10741.32 | bwd_inner: 9159.26 | bwd_allreduce: 1581.72 | step: 1271.24 + 83%|████████▎ | 257/309 [1:15:35<15:07, 17.44s/it] {'loss': 0.1993, 'learning_rate': 2.911611909070229e-06, 'epoch': 2.47} + 83%|████████▎ | 257/309 [1:15:35<15:07, 17.44s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:57:03,697] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 294.62 | bwd_inner_microstep: 294.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 887 +[2025-04-26 22:57:03,951] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.13 | bwd_microstep: 162.48 | bwd_inner_microstep: 162.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:57:04,438] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 311.31 | bwd_inner_microstep: 311.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 888 +[2025-04-26 22:57:04,691] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.80 | bwd_microstep: 162.58 | bwd_inner_microstep: 162.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:57:05,177] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 311.19 | bwd_inner_microstep: 311.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:57:05,665] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:57:06,153] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:57:06,641] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 310.40 | bwd_inner_microstep: 310.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:57:07,129] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:57:07,616] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 309.38 | bwd_inner_microstep: 309.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:57:08,103] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 310.24 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:08,592] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.12 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1907 +[2025-04-26 22:57:09,081] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.59 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:09,569] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.18 | bwd_microstep: 310.33 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:10,058] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.85 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 22:57:10,434] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.17 | bwd_microstep: 240.25 | bwd_inner_microstep: 240.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:57:10,921] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:57:11,608] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 371.99 | bwd_microstep: 309.98 | bwd_inner_microstep: 309.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:57:12,095] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:57:12,583] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:57:13,070] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 22:57:13,557] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 309.56 | bwd_inner_microstep: 309.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 940 +[2025-04-26 22:57:13,829] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 92.47 | bwd_microstep: 175.14 | bwd_inner_microstep: 175.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:57:14,334] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.11 | bwd_microstep: 323.65 | bwd_inner_microstep: 323.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:57:14,823] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 311.64 | bwd_inner_microstep: 311.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:57:15,312] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.19 | bwd_microstep: 311.25 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 22:57:15,444] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 41.13 | bwd_microstep: 87.00 | bwd_inner_microstep: 86.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:57:15,932] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 311.52 | bwd_inner_microstep: 311.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:57:16,421] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.95 | bwd_microstep: 312.47 | bwd_inner_microstep: 312.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:57:16,911] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 311.28 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:57:17,399] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.85 | bwd_inner_microstep: 310.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:57:19,861] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.38 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 22:57:19,861] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.25 | bwd_microstep: 1011.55 | bwd_inner_microstep: 622.19 | bwd_allreduce_microstep: 389.32 | step_microstep: 1272.11 +[2025-04-26 22:57:19,863] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5283.83 | bwd: 9914.32 | bwd_inner: 9524.53 | bwd_allreduce: 389.44 | step: 1273.29 + 83%|████████▎ | 258/309 [1:15:52<14:37, 17.20s/it] {'loss': 0.2099, 'learning_rate': 2.8033718037175915e-06, 'epoch': 2.48} + 83%|████████▎ | 258/309 [1:15:52<14:37, 17.20s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:20,333] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 293.73 | bwd_inner_microstep: 293.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:57:20,823] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.77 | bwd_microstep: 312.76 | bwd_inner_microstep: 312.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:57:21,311] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 378 +[2025-04-26 22:57:21,441] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.85 | bwd_microstep: 86.61 | bwd_inner_microstep: 86.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:57:21,928] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 311.92 | bwd_inner_microstep: 311.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:57:22,415] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:22,903] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:57:23,391] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.92 | bwd_microstep: 310.13 | bwd_inner_microstep: 310.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:23,878] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 310.21 | bwd_inner_microstep: 310.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:57:24,367] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:57:24,855] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 310.73 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:57:25,341] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:25,832] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.55 | bwd_microstep: 311.45 | bwd_inner_microstep: 311.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:57:26,320] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 310.56 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:57:26,807] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 309.93 | bwd_inner_microstep: 309.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:27,296] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.17 | bwd_microstep: 310.83 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:27,784] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.27 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 888 +[2025-04-26 22:57:28,038] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.79 | bwd_microstep: 163.32 | bwd_inner_microstep: 163.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:57:28,526] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 311.29 | bwd_inner_microstep: 311.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:57:29,012] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:57:29,500] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 22:57:29,987] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 309.08 | bwd_inner_microstep: 309.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2024 +[2025-04-26 22:57:30,505] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.36 | bwd_microstep: 332.99 | bwd_inner_microstep: 332.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:57:30,995] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:57:31,485] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.32 | bwd_microstep: 311.02 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:57:31,974] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 311.38 | bwd_inner_microstep: 311.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:57:32,464] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.66 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:57:32,953] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 311.29 | bwd_inner_microstep: 311.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:57:33,442] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 310.99 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:57:33,930] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 310.74 | bwd_inner_microstep: 310.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:57:34,420] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.22 | bwd_microstep: 311.06 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:57:37,058] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.41 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 22:57:37,058] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 1189.75 | bwd_inner_microstep: 340.16 | bwd_allreduce_microstep: 849.55 | step_microstep: 1271.12 +[2025-04-26 22:57:37,060] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5303.40 | bwd: 10457.18 | bwd_inner: 9607.16 | bwd_allreduce: 849.67 | step: 1272.27 + 84%|████████▍ | 259/309 [1:16:09<14:20, 17.20s/it] {'loss': 0.1388, 'learning_rate': 2.6970301374866337e-06, 'epoch': 2.49} + 84%|████████▍ | 259/309 [1:16:09<14:20, 17.20s/it]dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1399 +[2025-04-26 22:57:37,416] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 126.94 | bwd_microstep: 223.06 | bwd_inner_microstep: 223.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:57:37,902] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 310.56 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 377 +[2025-04-26 22:57:38,034] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.75 | bwd_microstep: 87.02 | bwd_inner_microstep: 87.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:38,522] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 310.84 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:57:39,011] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 311.76 | bwd_inner_microstep: 311.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:39,499] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:57:39,987] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:57:40,477] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.48 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:57:40,966] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 311.18 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:57:41,454] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 310.83 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:41,942] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:57:42,430] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:42,918] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 310.42 | bwd_inner_microstep: 310.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:43,408] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 311.24 | bwd_inner_microstep: 311.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:57:43,896] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.04 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:57:44,384] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:57:44,872] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 310.26 | bwd_inner_microstep: 310.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 22:57:45,003] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.11 | bwd_microstep: 85.90 | bwd_inner_microstep: 85.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:57:45,488] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:57:45,975] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1895 +[2025-04-26 22:57:46,463] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 309.59 | bwd_inner_microstep: 309.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:57:46,949] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 309.14 | bwd_inner_microstep: 309.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1925 +[2025-04-26 22:57:47,456] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.34 | bwd_microstep: 323.57 | bwd_inner_microstep: 323.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.08 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:57:47,947] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.29 | bwd_microstep: 311.46 | bwd_inner_microstep: 311.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:57:48,437] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 311.44 | bwd_inner_microstep: 311.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:57:48,926] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.00 | bwd_microstep: 311.28 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:57:49,058] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.61 | bwd_microstep: 86.63 | bwd_inner_microstep: 86.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:57:49,546] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 311.66 | bwd_inner_microstep: 311.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:57:50,035] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 310.83 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:57:50,524] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 311.53 | bwd_inner_microstep: 311.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1907 +[2025-04-26 22:57:51,013] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:57:55,496] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1205.15 | optimizer_gradients: 17.52 | optimizer_step: 32.05 +[2025-04-26 22:57:55,497] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 3032.42 | bwd_inner_microstep: 339.68 | bwd_allreduce_microstep: 2692.70 | step_microstep: 1272.88 +[2025-04-26 22:57:55,498] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5080.32 | bwd: 11918.09 | bwd_inner: 9224.92 | bwd_allreduce: 2692.82 | step: 1274.11 + 84%|████████▍ | 260/309 [1:16:28<14:21, 17.57s/it] {'loss': 0.2015, 'learning_rate': 2.5925986500729083e-06, 'epoch': 2.5} + 84%|████████▍ | 260/309 [1:16:28<14:21, 17.57s/it]dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1400 +[2025-04-26 22:57:55,855] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 126.74 | bwd_microstep: 223.61 | bwd_inner_microstep: 223.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:57:56,341] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:56,829] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 311.10 | bwd_inner_microstep: 311.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:57:57,317] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 310.26 | bwd_inner_microstep: 310.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:57:57,804] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 310.31 | bwd_inner_microstep: 310.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:57:58,294] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.30 | bwd_microstep: 311.51 | bwd_inner_microstep: 311.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 22:57:58,425] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.97 | bwd_microstep: 86.86 | bwd_inner_microstep: 86.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:57:58,911] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 310.16 | bwd_inner_microstep: 310.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:57:59,399] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 311.66 | bwd_inner_microstep: 311.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:57:59,887] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:00,374] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 310.42 | bwd_inner_microstep: 310.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:00,862] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 310.38 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:58:01,350] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:58:01,838] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:02,325] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:02,813] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 311.31 | bwd_inner_microstep: 311.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:03,302] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 22:58:03,433] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.31 | bwd_microstep: 86.51 | bwd_inner_microstep: 86.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 22:58:03,919] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 4, images per sample: 4.0, dynamic token length: 1128 +[2025-04-26 22:58:04,219] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 104.41 | bwd_microstep: 190.68 | bwd_inner_microstep: 190.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:58:04,705] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 309.90 | bwd_inner_microstep: 309.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:58:05,191] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 309.98 | bwd_inner_microstep: 309.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 405 +[2025-04-26 22:58:05,334] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.99 | bwd_microstep: 95.22 | bwd_inner_microstep: 95.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 388 +[2025-04-26 22:58:05,473] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.57 | bwd_microstep: 93.93 | bwd_inner_microstep: 93.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 22:58:05,977] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.04 | bwd_microstep: 323.89 | bwd_inner_microstep: 323.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1408 +[2025-04-26 22:58:06,351] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.05 | bwd_microstep: 239.86 | bwd_inner_microstep: 239.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:58:06,838] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 311.02 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:58:07,328] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.97 | bwd_microstep: 311.79 | bwd_inner_microstep: 311.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:58:07,818] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.85 | bwd_microstep: 311.51 | bwd_inner_microstep: 311.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:58:08,306] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:58:08,795] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 310.95 | bwd_inner_microstep: 310.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 890 +[2025-04-26 22:58:12,275] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.47 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 22:58:12,275] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.64 | bwd_microstep: 2118.93 | bwd_inner_microstep: 191.07 | bwd_allreduce_microstep: 1927.82 | step_microstep: 1271.33 +[2025-04-26 22:58:12,277] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4742.37 | bwd: 10604.48 | bwd_inner: 8676.20 | bwd_allreduce: 1927.94 | step: 1272.47 + 84%|████████▍ | 261/309 [1:16:44<13:52, 17.33s/it] {'loss': 0.233, 'learning_rate': 2.490088870295839e-06, 'epoch': 2.51} + 84%|████████▍ | 261/309 [1:16:44<13:52, 17.33s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:58:12,747] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 294.47 | bwd_inner_microstep: 294.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:58:13,235] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:58:13,721] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 309.39 | bwd_inner_microstep: 309.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:58:14,210] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.21 | bwd_microstep: 311.26 | bwd_inner_microstep: 311.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:58:14,698] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 309.87 | bwd_inner_microstep: 309.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:58:15,185] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:58:15,673] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:16,161] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:58:16,650] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.80 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:58:17,137] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 309.87 | bwd_inner_microstep: 309.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:17,625] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:18,114] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.29 | bwd_microstep: 310.04 | bwd_inner_microstep: 310.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:58:18,600] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 309.55 | bwd_inner_microstep: 309.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:19,088] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.24 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:19,575] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:20,064] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 311.06 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:58:20,552] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.83 | bwd_microstep: 311.23 | bwd_inner_microstep: 311.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:58:21,040] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.95 | bwd_microstep: 310.58 | bwd_inner_microstep: 310.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:58:21,527] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 309.80 | bwd_inner_microstep: 309.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:58:22,013] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.18 | bwd_microstep: 308.90 | bwd_inner_microstep: 308.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 355 +[2025-04-26 22:58:22,138] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.04 | bwd_microstep: 83.50 | bwd_inner_microstep: 83.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 867 +[2025-04-26 22:58:22,388] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 84.51 | bwd_microstep: 160.87 | bwd_inner_microstep: 160.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1962 +[2025-04-26 22:58:22,896] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.82 | bwd_microstep: 326.05 | bwd_inner_microstep: 326.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1942 +[2025-04-26 22:58:23,404] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.13 | bwd_microstep: 324.72 | bwd_inner_microstep: 324.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 22:58:23,909] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.95 | bwd_microstep: 322.09 | bwd_inner_microstep: 322.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:58:24,398] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.92 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:58:24,888] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.34 | bwd_microstep: 311.76 | bwd_inner_microstep: 311.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:58:25,376] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 310.94 | bwd_inner_microstep: 310.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:58:25,865] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 311.17 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:58:26,353] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.21 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:58:26,842] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 310.60 | bwd_inner_microstep: 310.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:58:29,305] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1205.13 | optimizer_gradients: 17.50 | optimizer_step: 32.03 +[2025-04-26 22:58:29,306] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.37 | bwd_microstep: 1013.72 | bwd_inner_microstep: 624.62 | bwd_allreduce_microstep: 389.07 | step_microstep: 1272.71 +[2025-04-26 22:58:29,307] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5307.62 | bwd: 10286.83 | bwd_inner: 9897.30 | bwd_allreduce: 389.19 | step: 1273.85 + 85%|████████▍ | 262/309 [1:17:01<13:30, 17.24s/it] {'loss': 0.2557, 'learning_rate': 2.3895121148260027e-06, 'epoch': 2.52} + 85%|████████▍ | 262/309 [1:17:01<13:30, 17.24s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:58:29,778] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 294.04 | bwd_inner_microstep: 294.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:58:30,265] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 311.66 | bwd_inner_microstep: 311.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:30,753] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.08 | bwd_microstep: 310.13 | bwd_inner_microstep: 310.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:58:31,240] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 310.96 | bwd_inner_microstep: 310.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:31,729] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.71 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:58:32,216] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 310.16 | bwd_inner_microstep: 310.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1908 +[2025-04-26 22:58:32,704] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 310.86 | bwd_inner_microstep: 310.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:33,192] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.07 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:58:33,679] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:34,167] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.13 | bwd_microstep: 310.33 | bwd_inner_microstep: 310.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:58:34,654] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:35,142] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.58 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 880 +[2025-04-26 22:58:35,394] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.52 | bwd_microstep: 162.33 | bwd_inner_microstep: 162.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:35,881] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 311.23 | bwd_inner_microstep: 311.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:36,368] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:58:36,855] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 368 +[2025-04-26 22:58:36,985] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.13 | bwd_microstep: 85.84 | bwd_inner_microstep: 85.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:58:37,470] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:58:37,957] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 310.42 | bwd_inner_microstep: 310.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:58:38,444] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:58:38,931] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 309.24 | bwd_inner_microstep: 309.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:58:39,418] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 309.68 | bwd_inner_microstep: 309.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1925 +[2025-04-26 22:58:39,924] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.02 | bwd_microstep: 322.87 | bwd_inner_microstep: 322.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:58:40,412] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 382 +[2025-04-26 22:58:40,545] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.77 | bwd_microstep: 87.23 | bwd_inner_microstep: 87.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 22:58:40,674] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.02 | bwd_microstep: 87.16 | bwd_inner_microstep: 87.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:58:41,163] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 312.04 | bwd_inner_microstep: 312.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:58:41,651] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 312.02 | bwd_inner_microstep: 312.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:58:42,141] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 311.44 | bwd_inner_microstep: 311.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:58:42,629] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:58:43,119] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.32 | bwd_microstep: 311.18 | bwd_inner_microstep: 311.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:58:46,541] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.77 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 22:58:46,542] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 1974.65 | bwd_inner_microstep: 338.68 | bwd_allreduce_microstep: 1635.94 | step_microstep: 1270.58 +[2025-04-26 22:58:46,543] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5028.26 | bwd: 10781.49 | bwd_inner: 9145.08 | bwd_allreduce: 1636.06 | step: 1271.65 + 85%|████████▌ | 263/309 [1:17:19<13:13, 17.24s/it] {'loss': 0.1154, 'learning_rate': 2.2908794869358044e-06, 'epoch': 2.53} + 85%|████████▌ | 263/309 [1:17:19<13:13, 17.24s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:58:47,015] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 294.75 | bwd_inner_microstep: 294.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:47,503] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:58:47,989] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 310.43 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:58:48,121] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.74 | bwd_microstep: 86.92 | bwd_inner_microstep: 86.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:58:48,606] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:58:49,094] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 311.65 | bwd_inner_microstep: 311.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:49,581] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:50,069] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:50,557] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:51,043] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:51,531] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.77 | bwd_microstep: 310.45 | bwd_inner_microstep: 310.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:58:52,019] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:58:52,506] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:58:52,994] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:58:53,482] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.60 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:58:53,969] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.95 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:58:54,457] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.94 | bwd_microstep: 309.96 | bwd_inner_microstep: 309.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:58:54,945] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 309.84 | bwd_inner_microstep: 309.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:58:55,432] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:58:55,919] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 309.57 | bwd_inner_microstep: 309.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:58:56,407] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 310.93 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:58:56,893] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 309.16 | bwd_inner_microstep: 309.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 4, images per sample: 4.0, dynamic token length: 1206 +[2025-04-26 22:58:57,210] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 108.84 | bwd_microstep: 204.01 | bwd_inner_microstep: 204.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 387 +[2025-04-26 22:58:57,350] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.33 | bwd_microstep: 93.39 | bwd_inner_microstep: 93.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1924 +[2025-04-26 22:58:57,856] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.47 | bwd_microstep: 323.11 | bwd_inner_microstep: 323.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:58:58,344] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 311.64 | bwd_inner_microstep: 311.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:58:58,833] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 311.13 | bwd_inner_microstep: 311.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:58:59,321] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:58:59,810] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.04 | bwd_microstep: 311.07 | bwd_inner_microstep: 311.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:58:59,941] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.42 | bwd_microstep: 86.77 | bwd_inner_microstep: 86.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:59:00,427] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:59:03,332] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.95 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 22:59:03,333] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 1451.77 | bwd_inner_microstep: 340.14 | bwd_allreduce_microstep: 1111.59 | step_microstep: 1276.92 +[2025-04-26 22:59:03,334] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5057.83 | bwd: 10304.25 | bwd_inner: 9192.19 | bwd_allreduce: 1111.71 | step: 1277.94 + 85%|████████▌ | 264/309 [1:17:35<12:49, 17.11s/it] {'loss': 0.1874, 'learning_rate': 2.1942018752737227e-06, 'epoch': 2.54} + 85%|████████▌ | 264/309 [1:17:35<12:49, 17.11s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:03,804] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 293.57 | bwd_inner_microstep: 293.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:59:04,293] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.63 | bwd_microstep: 312.76 | bwd_inner_microstep: 312.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:59:04,781] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.41 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:59:05,268] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 310.45 | bwd_inner_microstep: 310.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:05,757] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.39 | bwd_microstep: 310.49 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:59:06,245] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:59:06,733] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 311.34 | bwd_inner_microstep: 311.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:07,220] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.77 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:07,707] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.54 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 22:59:07,838] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.11 | bwd_microstep: 86.69 | bwd_inner_microstep: 86.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:59:08,324] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:08,811] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 311.27 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:59:09,298] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 309.78 | bwd_inner_microstep: 309.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:09,785] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 310.31 | bwd_inner_microstep: 310.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:59:10,274] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.29 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:59:10,761] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 309.80 | bwd_inner_microstep: 309.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:59:11,247] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 309.78 | bwd_inner_microstep: 309.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:59:11,735] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.34 | bwd_microstep: 309.74 | bwd_inner_microstep: 309.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:59:12,222] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 309.87 | bwd_inner_microstep: 309.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:59:12,710] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 310.24 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:59:13,195] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 308.65 | bwd_inner_microstep: 308.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:59:13,681] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 308.96 | bwd_inner_microstep: 308.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 964 +[2025-04-26 22:59:13,958] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 94.00 | bwd_microstep: 178.04 | bwd_inner_microstep: 178.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 22:59:14,462] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.68 | bwd_microstep: 323.59 | bwd_inner_microstep: 323.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:59:14,950] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 311.87 | bwd_inner_microstep: 311.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 22:59:15,438] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.04 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1404 +[2025-04-26 22:59:15,811] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.77 | bwd_microstep: 239.93 | bwd_inner_microstep: 239.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:59:16,299] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 312.27 | bwd_inner_microstep: 312.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:59:16,788] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 311.69 | bwd_inner_microstep: 311.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:59:17,276] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:59:17,765] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 311.50 | bwd_inner_microstep: 311.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 22:59:20,941] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.41 | optimizer_gradients: 17.51 | optimizer_step: 32.02 +[2025-04-26 22:59:20,942] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 1728.09 | bwd_inner_microstep: 338.61 | bwd_allreduce_microstep: 1389.45 | step_microstep: 1271.00 +[2025-04-26 22:59:20,943] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5259.74 | bwd: 10925.62 | bwd_inner: 9535.71 | bwd_allreduce: 1389.57 | step: 1272.02 + 86%|████████▌ | 265/309 [1:17:53<12:39, 17.26s/it] {'loss': 0.2139, 'learning_rate': 2.099489952662248e-06, 'epoch': 2.55} + 86%|████████▌ | 265/309 [1:17:53<12:39, 17.26s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:21,414] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 294.43 | bwd_inner_microstep: 294.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:59:21,901] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 310.74 | bwd_inner_microstep: 310.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:59:22,389] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:59:22,877] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 22:59:23,008] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.18 | bwd_microstep: 86.81 | bwd_inner_microstep: 86.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:59:23,495] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 311.58 | bwd_inner_microstep: 311.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:23,982] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 310.96 | bwd_inner_microstep: 310.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:24,469] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:59:24,958] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 311.88 | bwd_inner_microstep: 311.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:59:25,446] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:25,933] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 310.61 | bwd_inner_microstep: 310.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:59:26,420] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 309.87 | bwd_inner_microstep: 309.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:59:26,551] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.45 | bwd_microstep: 86.43 | bwd_inner_microstep: 86.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:27,039] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 311.60 | bwd_inner_microstep: 311.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:59:27,526] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:28,014] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 311.19 | bwd_inner_microstep: 311.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:28,502] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.24 | bwd_inner_microstep: 310.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 22:59:28,990] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.04 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:59:29,477] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 309.78 | bwd_inner_microstep: 309.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:59:29,964] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 22:59:30,453] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 309.22 | bwd_inner_microstep: 309.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1964 +[2025-04-26 22:59:30,963] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.47 | bwd_microstep: 327.02 | bwd_inner_microstep: 327.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1939 +[2025-04-26 22:59:31,470] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.51 | bwd_microstep: 323.98 | bwd_inner_microstep: 323.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 22:59:31,958] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 311.34 | bwd_inner_microstep: 311.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:59:32,448] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.81 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:59:32,936] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.76 | bwd_microstep: 311.38 | bwd_inner_microstep: 311.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:59:33,426] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.90 | bwd_microstep: 311.24 | bwd_inner_microstep: 311.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:59:33,913] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 310.93 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 22:59:34,402] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.23 | bwd_microstep: 311.31 | bwd_inner_microstep: 311.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:59:34,891] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.36 | bwd_microstep: 311.05 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:35,379] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 311.00 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:37,847] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.99 | optimizer_gradients: 17.53 | optimizer_step: 32.02 +[2025-04-26 22:59:37,848] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.33 | bwd_microstep: 1017.62 | bwd_inner_microstep: 627.42 | bwd_allreduce_microstep: 390.16 | step_microstep: 1272.86 +[2025-04-26 22:59:37,849] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5262.43 | bwd: 10216.49 | bwd_inner: 9825.86 | bwd_allreduce: 390.28 | step: 1273.88 + 86%|████████▌ | 266/309 [1:18:10<12:17, 17.15s/it] {'loss': 0.2447, 'learning_rate': 2.0067541749196453e-06, 'epoch': 2.56} + 86%|████████▌ | 266/309 [1:18:10<12:17, 17.15s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:59:38,320] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 294.26 | bwd_inner_microstep: 294.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:59:38,809] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 311.86 | bwd_inner_microstep: 311.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:59:39,296] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:39,784] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:40,272] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:59:40,760] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 22:59:41,249] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.24 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:41,738] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 311.57 | bwd_inner_microstep: 311.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:42,228] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.40 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 22:59:42,716] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:59:43,204] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 22:59:43,691] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:44,180] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.85 | bwd_microstep: 310.83 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:44,669] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:59:45,157] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:45,646] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 880 +[2025-04-26 22:59:45,900] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.94 | bwd_microstep: 162.76 | bwd_inner_microstep: 162.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:59:46,387] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:59:46,874] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.63 | bwd_microstep: 310.69 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 22:59:47,364] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 311.08 | bwd_inner_microstep: 311.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 22:59:47,494] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.15 | bwd_microstep: 86.46 | bwd_inner_microstep: 86.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 22:59:47,980] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.26 | bwd_microstep: 309.71 | bwd_inner_microstep: 309.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1986 +[2025-04-26 22:59:48,493] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.61 | bwd_microstep: 329.52 | bwd_inner_microstep: 329.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:59:48,982] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.16 | bwd_microstep: 311.37 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:59:49,471] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 311.44 | bwd_inner_microstep: 311.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 22:59:49,961] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.00 | bwd_microstep: 312.00 | bwd_inner_microstep: 311.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:59:50,450] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.25 | bwd_microstep: 311.23 | bwd_inner_microstep: 311.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 22:59:50,939] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 311.51 | bwd_inner_microstep: 311.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 22:59:51,428] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.15 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:51,916] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 311.43 | bwd_inner_microstep: 311.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:59:52,404] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 310.99 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:55,097] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.62 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 22:59:55,097] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 1244.45 | bwd_inner_microstep: 339.76 | bwd_allreduce_microstep: 904.66 | step_microstep: 1271.35 +[2025-04-26 22:59:55,099] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5305.32 | bwd: 10511.01 | bwd_inner: 9605.89 | bwd_allreduce: 904.78 | step: 1272.49 + 86%|████████▋ | 267/309 [1:18:27<12:01, 17.18s/it] {'loss': 0.2365, 'learning_rate': 1.916004779705669e-06, 'epoch': 2.57} + 86%|████████▋ | 267/309 [1:18:27<12:01, 17.18s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:55,570] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 294.06 | bwd_inner_microstep: 294.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:56,057] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 311.08 | bwd_inner_microstep: 311.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 22:59:56,545] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:59:56,675] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.05 | bwd_microstep: 86.53 | bwd_inner_microstep: 86.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:57,161] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:57,649] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.51 | bwd_microstep: 311.63 | bwd_inner_microstep: 311.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 22:59:57,780] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.20 | bwd_microstep: 86.50 | bwd_inner_microstep: 86.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:58,266] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 311.17 | bwd_inner_microstep: 311.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 22:59:58,753] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 311.70 | bwd_inner_microstep: 311.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 22:59:59,241] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 22:59:59,729] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 310.95 | bwd_inner_microstep: 310.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:00,217] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:00:00,705] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:01,194] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:01,683] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 311.42 | bwd_inner_microstep: 311.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 369 +[2025-04-26 23:00:01,814] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.08 | bwd_microstep: 86.20 | bwd_inner_microstep: 86.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:00:02,300] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 310.05 | bwd_inner_microstep: 310.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 368 +[2025-04-26 23:00:02,430] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.78 | bwd_microstep: 86.01 | bwd_inner_microstep: 85.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:00:02,917] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:00:03,406] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 311.23 | bwd_inner_microstep: 311.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:00:03,893] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 309.99 | bwd_inner_microstep: 309.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 414 +[2025-04-26 23:00:04,035] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.98 | bwd_microstep: 95.43 | bwd_inner_microstep: 95.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1999 +[2025-04-26 23:00:04,548] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.02 | bwd_microstep: 330.71 | bwd_inner_microstep: 330.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:00:05,036] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 312.03 | bwd_inner_microstep: 312.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1924 +[2025-04-26 23:00:05,542] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.45 | bwd_microstep: 322.30 | bwd_inner_microstep: 322.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 23:00:06,048] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.31 | bwd_microstep: 323.00 | bwd_inner_microstep: 322.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:00:06,544] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.49 | bwd_microstep: 317.79 | bwd_inner_microstep: 317.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 23:00:06,675] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.48 | bwd_microstep: 87.09 | bwd_inner_microstep: 87.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:00:07,162] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 311.20 | bwd_inner_microstep: 311.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:00:07,650] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 311.44 | bwd_inner_microstep: 311.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 371 +[2025-04-26 23:00:07,780] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.94 | bwd_microstep: 86.37 | bwd_inner_microstep: 86.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:00:11,849] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1206.17 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 23:00:11,849] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 2620.64 | bwd_inner_microstep: 339.17 | bwd_allreduce_microstep: 2281.43 | step_microstep: 1274.08 +[2025-04-26 23:00:11,851] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4597.05 | bwd: 10731.44 | bwd_inner: 8449.54 | bwd_allreduce: 2281.55 | step: 1275.14 + 87%|████████▋ | 268/309 [1:18:44<11:39, 17.05s/it] {'loss': 0.2033, 'learning_rate': 1.8272517853913775e-06, 'epoch': 2.58} + 87%|████████▋ | 268/309 [1:18:44<11:39, 17.05s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:00:12,321] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 294.12 | bwd_inner_microstep: 294.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:00:12,809] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.64 | bwd_microstep: 311.34 | bwd_inner_microstep: 311.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:00:13,296] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 310.40 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:00:13,783] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.77 | bwd_microstep: 310.56 | bwd_inner_microstep: 310.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 23:00:13,914] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.23 | bwd_microstep: 86.56 | bwd_inner_microstep: 86.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:14,399] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 310.58 | bwd_inner_microstep: 310.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 378 +[2025-04-26 23:00:14,530] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.65 | bwd_microstep: 86.68 | bwd_inner_microstep: 86.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:00:15,016] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:15,503] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:15,989] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 309.93 | bwd_inner_microstep: 309.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:00:16,476] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.92 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 23:00:16,607] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.47 | bwd_microstep: 86.51 | bwd_inner_microstep: 86.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:17,094] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:00:17,581] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:00:18,067] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 309.66 | bwd_inner_microstep: 309.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:18,554] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.27 | bwd_inner_microstep: 310.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:00:19,041] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 309.73 | bwd_inner_microstep: 309.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:19,529] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 881 +[2025-04-26 23:00:19,780] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.40 | bwd_microstep: 162.27 | bwd_inner_microstep: 162.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:00:20,265] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:00:20,752] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 310.18 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1986 +[2025-04-26 23:00:21,264] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.60 | bwd_microstep: 329.06 | bwd_inner_microstep: 329.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1939 +[2025-04-26 23:00:21,771] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.59 | bwd_microstep: 324.58 | bwd_inner_microstep: 324.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:00:22,260] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.14 | bwd_microstep: 310.88 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:00:22,748] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 311.46 | bwd_inner_microstep: 311.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:00:23,239] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.19 | bwd_microstep: 311.94 | bwd_inner_microstep: 311.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:00:23,726] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 310.38 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 23:00:23,856] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.16 | bwd_microstep: 86.75 | bwd_inner_microstep: 86.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:00:24,344] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 312.01 | bwd_inner_microstep: 312.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:00:24,832] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 311.66 | bwd_inner_microstep: 311.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:00:25,320] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 311.10 | bwd_inner_microstep: 311.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 891 +[2025-04-26 23:00:29,243] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.72 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 23:00:29,243] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.30 | bwd_microstep: 2562.32 | bwd_inner_microstep: 190.86 | bwd_allreduce_microstep: 2371.42 | step_microstep: 1270.38 +[2025-04-26 23:00:29,245] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4812.06 | bwd: 11164.62 | bwd_inner: 8792.74 | bwd_allreduce: 2371.54 | step: 1271.37 + 87%|████████▋ | 269/309 [1:19:01<11:26, 17.15s/it] {'loss': 0.2648, 'learning_rate': 1.740504989953129e-06, 'epoch': 2.59} + 87%|████████▋ | 269/309 [1:19:01<11:26, 17.15s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:00:29,715] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 293.68 | bwd_inner_microstep: 293.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:00:30,201] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:00:30,688] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 23:00:30,818] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.75 | bwd_microstep: 86.35 | bwd_inner_microstep: 86.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:00:31,304] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 311.15 | bwd_inner_microstep: 311.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:00:31,791] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 310.49 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:00:32,277] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 309.94 | bwd_inner_microstep: 309.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:32,763] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 309.96 | bwd_inner_microstep: 309.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 23:00:32,893] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.67 | bwd_microstep: 86.46 | bwd_inner_microstep: 86.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:33,379] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 310.93 | bwd_inner_microstep: 310.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:00:33,865] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 309.78 | bwd_inner_microstep: 309.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:00:34,351] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 309.93 | bwd_inner_microstep: 309.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:00:34,838] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 309.74 | bwd_inner_microstep: 309.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:00:35,324] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 309.68 | bwd_inner_microstep: 309.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:35,811] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 23:00:36,298] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:00:36,785] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 309.93 | bwd_inner_microstep: 309.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:00:37,272] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:00:37,759] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 309.79 | bwd_inner_microstep: 309.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:00:38,245] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 309.06 | bwd_inner_microstep: 309.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 23:00:38,732] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 309.81 | bwd_inner_microstep: 309.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 23:00:39,218] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 308.78 | bwd_inner_microstep: 308.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 915 +[2025-04-26 23:00:39,486] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 92.01 | bwd_microstep: 172.58 | bwd_inner_microstep: 172.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 23:00:39,990] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.11 | bwd_microstep: 322.83 | bwd_inner_microstep: 322.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:00:40,477] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.26 | bwd_microstep: 311.24 | bwd_inner_microstep: 311.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:00:40,965] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:00:41,454] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 311.02 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 23:00:41,585] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.48 | bwd_microstep: 86.96 | bwd_inner_microstep: 86.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:00:42,073] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 311.94 | bwd_inner_microstep: 311.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:00:42,560] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 311.35 | bwd_inner_microstep: 311.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:00:43,048] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.00 | bwd_microstep: 309.94 | bwd_inner_microstep: 309.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:00:47,285] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1205.06 | optimizer_gradients: 17.52 | optimizer_step: 32.02 +[2025-04-26 23:00:47,286] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 2788.36 | bwd_inner_microstep: 339.53 | bwd_allreduce_microstep: 2448.80 | step_microstep: 1272.90 +[2025-04-26 23:00:47,287] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5027.93 | bwd: 11594.00 | bwd_inner: 9144.74 | bwd_allreduce: 2448.92 | step: 1273.90 + 87%|████████▋ | 270/309 [1:19:19<11:19, 17.42s/it] {'loss': 0.1375, 'learning_rate': 1.6557739698909436e-06, 'epoch': 2.6} + 87%|████████▋ | 270/309 [1:19:19<11:19, 17.42s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:00:47,758] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 293.95 | bwd_inner_microstep: 293.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 23:00:47,889] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.78 | bwd_microstep: 86.61 | bwd_inner_microstep: 86.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:00:48,373] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.04 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:00:48,860] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:00:48,990] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.71 | bwd_microstep: 86.65 | bwd_inner_microstep: 86.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:00:49,475] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:49,961] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 310.16 | bwd_inner_microstep: 310.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:00:50,447] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 309.70 | bwd_inner_microstep: 309.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:00:50,934] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:00:51,421] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 309.79 | bwd_inner_microstep: 309.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:00:51,551] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.86 | bwd_microstep: 86.50 | bwd_inner_microstep: 86.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:00:52,038] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 312.19 | bwd_inner_microstep: 312.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:00:52,525] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:00:53,011] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.29 | bwd_microstep: 309.82 | bwd_inner_microstep: 309.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:00:53,498] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 310.01 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:00:53,985] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.87 | bwd_microstep: 309.71 | bwd_inner_microstep: 309.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:00:54,472] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 309.38 | bwd_inner_microstep: 309.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:00:54,958] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 309.79 | bwd_inner_microstep: 309.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:00:55,445] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 309.37 | bwd_inner_microstep: 309.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:00:55,930] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 309.20 | bwd_inner_microstep: 309.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 23:00:56,417] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 309.66 | bwd_inner_microstep: 309.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 23:00:56,903] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 309.38 | bwd_inner_microstep: 309.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 23:00:57,407] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.15 | bwd_microstep: 322.88 | bwd_inner_microstep: 322.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:00:57,895] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 311.49 | bwd_inner_microstep: 311.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:00:58,382] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 309.82 | bwd_inner_microstep: 309.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:00:58,869] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1404 +[2025-04-26 23:00:59,243] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.88 | bwd_microstep: 240.44 | bwd_inner_microstep: 240.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 23:00:59,374] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.47 | bwd_microstep: 87.19 | bwd_inner_microstep: 87.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:00:59,860] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:00,347] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:01:00,834] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:01:04,230] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.61 | optimizer_gradients: 17.51 | optimizer_step: 32.04 +[2025-04-26 23:01:04,231] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 1947.93 | bwd_inner_microstep: 337.77 | bwd_allreduce_microstep: 1610.13 | step_microstep: 1271.23 +[2025-04-26 23:01:04,232] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4930.52 | bwd: 10596.03 | bwd_inner: 8985.44 | bwd_allreduce: 1610.25 | step: 1272.27 + 88%|████████▊ | 271/309 [1:19:36<10:56, 17.28s/it] {'loss': 0.1661, 'learning_rate': 1.573068079171265e-06, 'epoch': 2.61} + 88%|████████▊ | 271/309 [1:19:36<10:56, 17.28s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:01:04,704] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 294.64 | bwd_inner_microstep: 294.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:05,191] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:05,676] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 309.94 | bwd_inner_microstep: 309.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1404 +[2025-04-26 23:01:06,049] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.37 | bwd_microstep: 239.64 | bwd_inner_microstep: 239.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 23:01:06,421] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.17 | bwd_microstep: 240.19 | bwd_inner_microstep: 240.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1402 +[2025-04-26 23:01:06,794] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 127.29 | bwd_microstep: 240.93 | bwd_inner_microstep: 240.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 886 +[2025-04-26 23:01:07,045] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.07 | bwd_microstep: 162.38 | bwd_inner_microstep: 162.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:07,531] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 310.24 | bwd_inner_microstep: 310.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:01:08,018] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 310.05 | bwd_inner_microstep: 310.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:08,504] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:08,990] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 309.85 | bwd_inner_microstep: 309.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:09,477] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 309.76 | bwd_inner_microstep: 309.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1399 +[2025-04-26 23:01:09,850] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.83 | bwd_microstep: 239.53 | bwd_inner_microstep: 239.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:10,336] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:10,823] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 310.26 | bwd_inner_microstep: 310.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 878 +[2025-04-26 23:01:11,075] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.88 | bwd_microstep: 161.69 | bwd_inner_microstep: 161.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:01:11,561] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 310.98 | bwd_inner_microstep: 310.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.06 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:01:12,046] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.19 | bwd_microstep: 309.46 | bwd_inner_microstep: 309.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:01:12,533] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 309.70 | bwd_inner_microstep: 309.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:01:13,019] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 309.62 | bwd_inner_microstep: 309.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:01:13,505] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 309.14 | bwd_inner_microstep: 309.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 23:01:13,990] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 308.93 | bwd_inner_microstep: 308.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1925 +[2025-04-26 23:01:14,495] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.36 | bwd_microstep: 322.31 | bwd_inner_microstep: 322.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1927 +[2025-04-26 23:01:15,001] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.06 | bwd_microstep: 322.80 | bwd_inner_microstep: 322.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 23:01:15,506] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.42 | bwd_microstep: 322.05 | bwd_inner_microstep: 322.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:01:15,994] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.88 | bwd_microstep: 310.88 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:01:16,483] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.93 | bwd_microstep: 311.63 | bwd_inner_microstep: 311.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1403 +[2025-04-26 23:01:16,857] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.21 | bwd_microstep: 239.87 | bwd_inner_microstep: 239.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:01:17,343] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 310.49 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:01:17,832] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 311.84 | bwd_inner_microstep: 311.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:01:18,320] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:01:22,028] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.97 | optimizer_gradients: 17.50 | optimizer_step: 32.02 +[2025-04-26 23:01:22,028] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 2260.83 | bwd_inner_microstep: 339.97 | bwd_allreduce_microstep: 1920.82 | step_microstep: 1271.69 +[2025-04-26 23:01:22,030] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5127.00 | bwd: 11251.45 | bwd_inner: 9330.17 | bwd_allreduce: 1920.94 | step: 1272.74 + 88%|████████▊ | 272/309 [1:19:54<10:45, 17.43s/it] {'loss': 0.2003, 'learning_rate': 1.4923964481943599e-06, 'epoch': 2.62} + 88%|████████▊ | 272/309 [1:19:54<10:45, 17.43s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:01:22,500] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 293.51 | bwd_inner_microstep: 293.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:01:22,987] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.37 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:01:23,473] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 309.75 | bwd_inner_microstep: 309.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:01:23,959] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 309.89 | bwd_inner_microstep: 309.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:01:24,445] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:01:24,933] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 310.78 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 23:01:25,064] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.12 | bwd_microstep: 86.55 | bwd_inner_microstep: 86.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:25,550] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:26,037] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 310.01 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:01:26,522] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 309.41 | bwd_inner_microstep: 309.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:27,010] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:27,497] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:01:27,628] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.03 | bwd_microstep: 86.53 | bwd_inner_microstep: 86.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 23:01:27,756] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.78 | bwd_microstep: 86.16 | bwd_inner_microstep: 86.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:01:28,242] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 311.06 | bwd_inner_microstep: 311.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:01:28,728] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:01:29,214] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 309.79 | bwd_inner_microstep: 309.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:01:29,700] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 309.57 | bwd_inner_microstep: 309.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:01:30,187] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 309.90 | bwd_inner_microstep: 309.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:01:30,672] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 309.06 | bwd_inner_microstep: 309.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:01:31,158] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 309.35 | bwd_inner_microstep: 309.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1966 +[2025-04-26 23:01:31,667] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.22 | bwd_microstep: 326.39 | bwd_inner_microstep: 326.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 392 +[2025-04-26 23:01:31,807] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.52 | bwd_microstep: 93.82 | bwd_inner_microstep: 93.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:01:32,295] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 311.54 | bwd_inner_microstep: 311.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:01:32,783] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.95 | bwd_microstep: 311.82 | bwd_inner_microstep: 311.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 23:01:32,915] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.74 | bwd_microstep: 87.28 | bwd_inner_microstep: 87.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:01:33,401] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 311.35 | bwd_inner_microstep: 311.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:01:33,889] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:01:34,375] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:01:34,863] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 310.95 | bwd_inner_microstep: 310.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:35,351] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:01:40,211] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.68 | optimizer_gradients: 17.53 | optimizer_step: 32.04 +[2025-04-26 23:01:40,211] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 3413.07 | bwd_inner_microstep: 338.70 | bwd_allreduce_microstep: 3074.34 | step_microstep: 1270.51 +[2025-04-26 23:01:40,213] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4846.27 | bwd: 11921.63 | bwd_inner: 8846.83 | bwd_allreduce: 3074.46 | step: 1271.52 + 88%|████████▊ | 273/309 [1:20:12<10:35, 17.66s/it] {'loss': 0.2477, 'learning_rate': 1.4137679827863293e-06, 'epoch': 2.63} + 88%|████████▊ | 273/309 [1:20:12<10:35, 17.66s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:01:40,681] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.73 | bwd_microstep: 293.41 | bwd_inner_microstep: 293.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:01:41,169] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 312.20 | bwd_inner_microstep: 312.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:01:41,655] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 309.64 | bwd_inner_microstep: 309.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:01:42,141] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:01:42,627] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 309.73 | bwd_inner_microstep: 309.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:01:43,114] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:01:43,601] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:01:44,089] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 311.07 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:44,576] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:45,064] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:01:45,549] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 309.29 | bwd_inner_microstep: 309.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:01:46,035] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 309.52 | bwd_inner_microstep: 309.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:01:46,521] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 309.58 | bwd_inner_microstep: 309.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:47,007] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 309.62 | bwd_inner_microstep: 309.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:47,494] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 310.04 | bwd_inner_microstep: 310.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:01:47,982] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:01:48,468] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 309.60 | bwd_inner_microstep: 309.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:01:48,954] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 309.26 | bwd_inner_microstep: 309.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:01:49,441] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.92 | bwd_microstep: 309.73 | bwd_inner_microstep: 309.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:01:49,927] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 309.34 | bwd_inner_microstep: 309.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 23:01:50,056] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.74 | bwd_microstep: 85.54 | bwd_inner_microstep: 85.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 23:01:50,541] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 309.71 | bwd_inner_microstep: 309.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 1000 +[2025-04-26 23:01:50,815] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 92.64 | bwd_microstep: 178.37 | bwd_inner_microstep: 178.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:01:51,303] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 310.99 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:01:51,791] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 310.85 | bwd_inner_microstep: 310.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:01:52,279] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 310.94 | bwd_inner_microstep: 310.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:01:52,768] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 311.52 | bwd_inner_microstep: 311.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:01:53,255] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 23:01:53,386] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.21 | bwd_microstep: 86.46 | bwd_inner_microstep: 86.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:01:53,873] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 311.57 | bwd_inner_microstep: 311.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:01:54,361] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 311.45 | bwd_inner_microstep: 311.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:01:57,896] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.25 | optimizer_gradients: 17.57 | optimizer_step: 32.04 +[2025-04-26 23:01:57,896] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 2085.92 | bwd_inner_microstep: 338.55 | bwd_allreduce_microstep: 1747.33 | step_microstep: 1272.37 +[2025-04-26 23:01:57,898] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5155.61 | bwd: 11108.10 | bwd_inner: 9360.31 | bwd_allreduce: 1747.45 | step: 1273.38 + 89%|████████▊ | 274/309 [1:20:30<10:18, 17.67s/it] {'loss': 0.2486, 'learning_rate': 1.3371913632159506e-06, 'epoch': 2.64} + 89%|████████▊ | 274/309 [1:20:30<10:18, 17.67s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:01:58,369] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 294.71 | bwd_inner_microstep: 294.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:01:58,856] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:01:59,342] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 309.95 | bwd_inner_microstep: 309.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:01:59,830] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.99 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:02:00,318] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 310.49 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:02:00,805] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 310.06 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:01,292] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:02:01,779] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:02,267] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:02,753] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:03,240] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:03,728] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:04,215] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 309.68 | bwd_inner_microstep: 309.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:04,702] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:02:05,190] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.03 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:05,677] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 310.49 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:06,166] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.55 | bwd_microstep: 310.84 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:02:06,651] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 309.60 | bwd_inner_microstep: 309.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:02:07,138] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 309.74 | bwd_inner_microstep: 309.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 869 +[2025-04-26 23:02:07,389] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.63 | bwd_microstep: 161.72 | bwd_inner_microstep: 161.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:02:07,873] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 309.33 | bwd_inner_microstep: 309.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 23:02:08,359] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.54 | bwd_microstep: 309.84 | bwd_inner_microstep: 309.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1951 +[2025-04-26 23:02:08,866] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.21 | bwd_microstep: 324.58 | bwd_inner_microstep: 324.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:02:09,354] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 23:02:09,484] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.88 | bwd_microstep: 86.69 | bwd_inner_microstep: 86.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:02:09,972] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 311.67 | bwd_inner_microstep: 311.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:02:10,460] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 310.94 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:02:10,947] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 311.07 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:02:11,437] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.32 | bwd_microstep: 311.82 | bwd_inner_microstep: 311.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 382 +[2025-04-26 23:02:11,569] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.15 | bwd_microstep: 87.44 | bwd_inner_microstep: 87.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:02:12,055] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:14,717] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.59 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 23:02:14,717] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 1215.15 | bwd_inner_microstep: 339.67 | bwd_allreduce_microstep: 875.44 | step_microstep: 1270.31 +[2025-04-26 23:02:14,719] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5161.46 | bwd: 10240.32 | bwd_inner: 9364.42 | bwd_allreduce: 875.56 | step: 1271.35 + 89%|████████▉ | 275/309 [1:20:47<09:52, 17.41s/it] {'loss': 0.1322, 'learning_rate': 1.2626750432364077e-06, 'epoch': 2.65} + 89%|████████▉ | 275/309 [1:20:47<09:52, 17.41s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:15,188] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 293.92 | bwd_inner_microstep: 293.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:02:15,675] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 311.22 | bwd_inner_microstep: 311.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:16,162] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 309.97 | bwd_inner_microstep: 309.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:02:16,649] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1404 +[2025-04-26 23:02:17,022] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.06 | bwd_microstep: 239.83 | bwd_inner_microstep: 239.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:02:17,507] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 310.16 | bwd_inner_microstep: 310.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:02:17,995] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:18,483] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 310.91 | bwd_inner_microstep: 310.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1908 +[2025-04-26 23:02:18,970] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:02:19,457] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 310.21 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:02:19,943] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 309.83 | bwd_inner_microstep: 309.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:02:20,430] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 309.97 | bwd_inner_microstep: 309.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:02:20,918] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 310.06 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:21,405] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:02:21,892] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 309.94 | bwd_inner_microstep: 309.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:02:22,380] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.77 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:02:22,867] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 23:02:22,996] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.95 | bwd_microstep: 85.93 | bwd_inner_microstep: 85.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:02:23,481] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 23:02:23,967] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.95 | bwd_microstep: 309.60 | bwd_inner_microstep: 309.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:02:24,453] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 308.56 | bwd_inner_microstep: 308.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 950 +[2025-04-26 23:02:24,725] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 92.11 | bwd_microstep: 176.14 | bwd_inner_microstep: 176.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1952 +[2025-04-26 23:02:25,232] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.34 | bwd_microstep: 325.37 | bwd_inner_microstep: 325.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 23:02:25,737] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.86 | bwd_microstep: 322.96 | bwd_inner_microstep: 322.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1929 +[2025-04-26 23:02:26,243] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.46 | bwd_microstep: 323.04 | bwd_inner_microstep: 323.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 23:02:26,730] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:02:27,220] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.74 | bwd_microstep: 311.26 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:02:27,708] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.95 | bwd_inner_microstep: 310.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:02:28,197] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.11 | bwd_microstep: 311.06 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1403 +[2025-04-26 23:02:28,571] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.03 | bwd_microstep: 240.63 | bwd_inner_microstep: 240.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:29,057] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.80 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:32,108] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.96 | optimizer_gradients: 17.54 | optimizer_step: 32.04 +[2025-04-26 23:02:32,108] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 1602.89 | bwd_inner_microstep: 338.67 | bwd_allreduce_microstep: 1264.18 | step_microstep: 1271.67 +[2025-04-26 23:02:32,110] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5224.60 | bwd: 10749.03 | bwd_inner: 9484.38 | bwd_allreduce: 1264.30 | step: 1272.69 + 89%|████████▉ | 276/309 [1:21:04<09:34, 17.41s/it] {'loss': 0.2583, 'learning_rate': 1.1902272491520362e-06, 'epoch': 2.66} + 89%|████████▉ | 276/309 [1:21:04<09:34, 17.41s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:32,581] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 294.22 | bwd_inner_microstep: 294.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:33,068] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.53 | bwd_microstep: 310.78 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:02:33,555] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 309.96 | bwd_inner_microstep: 309.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:34,043] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 311.00 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:34,531] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 310.31 | bwd_inner_microstep: 310.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:35,017] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:02:35,506] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 310.98 | bwd_inner_microstep: 310.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:02:35,994] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 311.08 | bwd_inner_microstep: 311.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:36,482] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 310.45 | bwd_inner_microstep: 310.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:36,969] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:02:37,458] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 311.47 | bwd_inner_microstep: 311.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 23:02:37,945] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 309.87 | bwd_inner_microstep: 309.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:02:38,076] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.34 | bwd_microstep: 86.32 | bwd_inner_microstep: 86.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:38,563] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 312.47 | bwd_inner_microstep: 312.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:39,051] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 311.20 | bwd_inner_microstep: 311.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:39,538] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 310.95 | bwd_inner_microstep: 310.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 23:02:40,027] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 310.69 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:02:40,514] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 309.92 | bwd_inner_microstep: 309.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:02:41,001] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 310.01 | bwd_inner_microstep: 310.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:02:41,488] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 309.83 | bwd_inner_microstep: 309.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2468 +[2025-04-26 23:02:42,090] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.86 | bwd_microstep: 397.26 | bwd_inner_microstep: 397.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 486 +[2025-04-26 23:02:42,240] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 44.08 | bwd_microstep: 101.52 | bwd_inner_microstep: 101.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1940 +[2025-04-26 23:02:42,745] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.96 | bwd_microstep: 324.83 | bwd_inner_microstep: 324.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:02:43,234] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 312.26 | bwd_inner_microstep: 312.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 23:02:43,741] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.98 | bwd_microstep: 323.38 | bwd_inner_microstep: 323.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:02:44,229] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 311.58 | bwd_inner_microstep: 311.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:02:44,718] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 382 +[2025-04-26 23:02:44,850] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.80 | bwd_microstep: 87.22 | bwd_inner_microstep: 87.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 23:02:44,979] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.24 | bwd_microstep: 87.11 | bwd_inner_microstep: 87.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 23:02:45,108] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.32 | bwd_microstep: 86.85 | bwd_inner_microstep: 86.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:02:45,594] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 888 +[2025-04-26 23:02:49,919] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.96 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 23:02:49,919] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 87.16 | bwd_microstep: 2964.96 | bwd_inner_microstep: 192.61 | bwd_allreduce_microstep: 2772.30 | step_microstep: 1269.64 +[2025-04-26 23:02:49,921] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4802.56 | bwd: 11591.93 | bwd_inner: 8819.16 | bwd_allreduce: 2772.42 | step: 1270.67 + 90%|████████▉ | 277/309 [1:21:22<09:20, 17.53s/it] {'loss': 0.2542, 'learning_rate': 1.119855978910165e-06, 'epoch': 2.67} + 90%|████████▉ | 277/309 [1:21:22<09:20, 17.53s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:50,391] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 293.71 | bwd_inner_microstep: 293.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:50,878] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 311.52 | bwd_inner_microstep: 311.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 377 +[2025-04-26 23:02:51,009] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.01 | bwd_microstep: 86.46 | bwd_inner_microstep: 86.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:51,495] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:02:51,981] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.26 | bwd_microstep: 311.00 | bwd_inner_microstep: 310.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:02:52,468] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 309.88 | bwd_inner_microstep: 309.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:52,956] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:02:53,444] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:53,930] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:02:54,417] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 309.43 | bwd_inner_microstep: 309.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:54,905] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 368 +[2025-04-26 23:02:55,035] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.68 | bwd_microstep: 86.01 | bwd_inner_microstep: 85.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:55,522] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 311.10 | bwd_inner_microstep: 311.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.12 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:02:55,653] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.01 | bwd_microstep: 86.55 | bwd_inner_microstep: 86.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:02:56,138] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:02:56,625] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 310.85 | bwd_inner_microstep: 310.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:02:56,755] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.77 | bwd_microstep: 86.57 | bwd_inner_microstep: 86.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:02:57,242] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 311.20 | bwd_inner_microstep: 311.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:02:57,728] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 309.94 | bwd_inner_microstep: 309.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:02:58,215] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:02:58,704] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.84 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 367 +[2025-04-26 23:02:58,834] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.73 | bwd_microstep: 86.02 | bwd_inner_microstep: 86.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1941 +[2025-04-26 23:02:59,338] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.11 | bwd_microstep: 324.45 | bwd_inner_microstep: 324.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1925 +[2025-04-26 23:02:59,844] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.37 | bwd_microstep: 323.37 | bwd_inner_microstep: 323.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 392 +[2025-04-26 23:02:59,984] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.64 | bwd_microstep: 93.98 | bwd_inner_microstep: 93.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:03:00,472] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 312.35 | bwd_inner_microstep: 312.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 23:03:00,604] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.63 | bwd_microstep: 87.16 | bwd_inner_microstep: 87.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:03:01,091] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 311.19 | bwd_inner_microstep: 311.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:03:01,580] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 311.51 | bwd_inner_microstep: 311.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:03:02,068] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:03:02,557] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:03:07,382] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.30 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 23:03:07,382] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.30 | bwd_microstep: 3375.54 | bwd_inner_microstep: 338.01 | bwd_allreduce_microstep: 3037.49 | step_microstep: 1272.09 +[2025-04-26 23:03:07,384] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4584.88 | bwd: 11453.06 | bwd_inner: 8415.12 | bwd_allreduce: 3037.61 | step: 1273.32 + 90%|████████▉ | 278/309 [1:21:40<09:02, 17.51s/it] {'loss': 0.2442, 'learning_rate': 1.0515690012181823e-06, 'epoch': 2.68} + 90%|████████▉ | 278/309 [1:21:40<09:02, 17.51s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:03:07,854] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.27 | bwd_microstep: 294.83 | bwd_inner_microstep: 294.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:03:08,341] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 310.18 | bwd_inner_microstep: 310.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:03:08,827] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 310.06 | bwd_inner_microstep: 310.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:03:09,314] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:03:09,801] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:03:10,289] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 310.97 | bwd_inner_microstep: 310.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:03:10,776] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:03:11,262] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 309.68 | bwd_inner_microstep: 309.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:11,749] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 309.80 | bwd_inner_microstep: 309.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:03:12,235] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.77 | bwd_microstep: 309.41 | bwd_inner_microstep: 309.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 23:03:12,608] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.72 | bwd_microstep: 239.56 | bwd_inner_microstep: 239.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 886 +[2025-04-26 23:03:12,860] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 84.94 | bwd_microstep: 162.54 | bwd_inner_microstep: 162.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 23:03:13,230] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 127.28 | bwd_microstep: 239.38 | bwd_inner_microstep: 239.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:03:13,717] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:03:14,203] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 309.97 | bwd_inner_microstep: 309.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:03:14,690] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:03:15,176] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 309.71 | bwd_inner_microstep: 309.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:03:15,662] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 309.33 | bwd_inner_microstep: 309.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:03:16,148] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 309.57 | bwd_inner_microstep: 309.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:03:16,634] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 308.81 | bwd_inner_microstep: 308.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 357 +[2025-04-26 23:03:16,762] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.44 | bwd_microstep: 85.28 | bwd_inner_microstep: 85.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 23:03:17,246] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 309.56 | bwd_inner_microstep: 309.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1940 +[2025-04-26 23:03:17,752] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.81 | bwd_microstep: 324.02 | bwd_inner_microstep: 324.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 384 +[2025-04-26 23:03:17,883] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.08 | bwd_microstep: 87.08 | bwd_inner_microstep: 87.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 23:03:18,388] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.51 | bwd_microstep: 323.97 | bwd_inner_microstep: 323.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 23:03:18,892] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.69 | bwd_microstep: 322.74 | bwd_inner_microstep: 322.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 383 +[2025-04-26 23:03:19,022] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.91 | bwd_microstep: 87.22 | bwd_inner_microstep: 87.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:03:19,509] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 311.84 | bwd_inner_microstep: 311.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:03:19,997] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 311.15 | bwd_inner_microstep: 311.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:03:20,485] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 311.43 | bwd_inner_microstep: 311.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:03:20,973] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 311.16 | bwd_inner_microstep: 311.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:25,100] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.71 | optimizer_gradients: 17.52 | optimizer_step: 32.02 +[2025-04-26 23:03:25,101] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 2680.29 | bwd_inner_microstep: 339.27 | bwd_allreduce_microstep: 2340.97 | step_microstep: 1270.61 +[2025-04-26 23:03:25,102] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4939.11 | bwd: 11360.97 | bwd_inner: 9019.54 | bwd_allreduce: 2341.09 | step: 1271.59 + 90%|█████████ | 279/309 [1:21:57<08:47, 17.57s/it] {'loss': 0.2323, 'learning_rate': 9.853738546858893e-07, 'epoch': 2.69} + 90%|█████████ | 279/309 [1:21:57<08:47, 17.57s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:03:25,572] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 293.26 | bwd_inner_microstep: 293.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:03:26,059] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 311.86 | bwd_inner_microstep: 311.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:03:26,545] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 309.56 | bwd_inner_microstep: 309.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:03:27,032] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:03:27,521] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.06 | bwd_microstep: 309.85 | bwd_inner_microstep: 309.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:03:28,010] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 310.18 | bwd_inner_microstep: 310.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:03:28,499] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 309.87 | bwd_inner_microstep: 309.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 23:03:28,632] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.78 | bwd_microstep: 86.44 | bwd_inner_microstep: 86.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:29,120] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:29,607] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:03:30,094] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:30,581] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:31,069] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:31,556] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 310.01 | bwd_inner_microstep: 310.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:03:32,043] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 309.76 | bwd_inner_microstep: 309.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:03:32,531] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.05 | bwd_inner_microstep: 310.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:03:33,017] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 309.39 | bwd_inner_microstep: 309.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:03:33,504] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:03:33,990] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 309.77 | bwd_inner_microstep: 309.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:03:34,476] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 309.23 | bwd_inner_microstep: 309.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 23:03:34,962] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 308.74 | bwd_inner_microstep: 308.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1974 +[2025-04-26 23:03:35,472] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.18 | bwd_microstep: 326.52 | bwd_inner_microstep: 326.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1930 +[2025-04-26 23:03:35,979] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.94 | bwd_microstep: 322.89 | bwd_inner_microstep: 322.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.07 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:03:36,470] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:03:36,959] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.80 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:03:37,446] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 310.91 | bwd_inner_microstep: 310.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:03:37,935] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:38,422] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:03:38,912] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.22 | bwd_microstep: 312.12 | bwd_inner_microstep: 312.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:03:39,400] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:03:39,889] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 311.26 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:03:43,188] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.81 | optimizer_gradients: 17.52 | optimizer_step: 32.05 +[2025-04-26 23:03:43,188] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 1850.11 | bwd_inner_microstep: 338.08 | bwd_allreduce_microstep: 1511.99 | step_microstep: 1271.69 +[2025-04-26 23:03:43,190] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5385.49 | bwd: 11256.81 | bwd_inner: 9744.35 | bwd_allreduce: 1512.11 | step: 1272.95 + 91%|█████████ | 280/309 [1:22:15<08:34, 17.73s/it] {'loss': 0.2138, 'learning_rate': 9.212778469932848e-07, 'epoch': 2.7} + 91%|█████████ | 280/309 [1:22:15<08:34, 17.73s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:03:43,660] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 293.71 | bwd_inner_microstep: 293.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:03:44,147] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 311.16 | bwd_inner_microstep: 311.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:44,633] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 309.77 | bwd_inner_microstep: 309.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:03:45,121] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 310.42 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 23:03:45,252] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.98 | bwd_microstep: 86.51 | bwd_inner_microstep: 86.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:03:45,738] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 310.78 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:03:46,225] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 310.33 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:03:46,712] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:03:47,199] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:47,687] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:48,175] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.24 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:03:48,662] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:49,150] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 309.96 | bwd_inner_microstep: 309.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:03:49,637] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:03:50,126] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 23:03:50,499] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.54 | bwd_microstep: 239.73 | bwd_inner_microstep: 239.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:03:50,984] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 309.68 | bwd_inner_microstep: 309.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:03:51,473] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 309.99 | bwd_inner_microstep: 309.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:03:51,962] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 309.98 | bwd_inner_microstep: 309.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:03:52,448] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 309.41 | bwd_inner_microstep: 309.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:03:52,936] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 309.65 | bwd_inner_microstep: 309.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:03:53,424] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.26 | bwd_microstep: 309.32 | bwd_inner_microstep: 309.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 404 +[2025-04-26 23:03:53,566] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 43.04 | bwd_microstep: 95.08 | bwd_inner_microstep: 95.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 23:03:54,070] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.79 | bwd_microstep: 323.07 | bwd_inner_microstep: 323.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:03:54,558] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 390 +[2025-04-26 23:03:54,699] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.85 | bwd_microstep: 93.93 | bwd_inner_microstep: 93.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:03:55,187] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.80 | bwd_microstep: 311.38 | bwd_inner_microstep: 311.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:03:55,676] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 311.86 | bwd_inner_microstep: 311.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:03:56,164] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 311.37 | bwd_inner_microstep: 311.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:03:56,653] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 310.69 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:03:57,141] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.83 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:04:00,379] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.91 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 23:04:00,380] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.45 | bwd_microstep: 1788.53 | bwd_inner_microstep: 340.00 | bwd_allreduce_microstep: 1448.48 | step_microstep: 1271.76 +[2025-04-26 23:04:00,381] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5071.15 | bwd: 10679.78 | bwd_inner: 9230.83 | bwd_allreduce: 1448.60 | step: 1272.98 + 91%|█████████ | 281/309 [1:22:33<08:11, 17.57s/it] {'loss': 0.183, 'learning_rate': 8.592880540838111e-07, 'epoch': 2.71} + 91%|█████████ | 281/309 [1:22:33<08:11, 17.57s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:04:00,853] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 294.10 | bwd_inner_microstep: 294.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:04:01,341] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 311.11 | bwd_inner_microstep: 311.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:04:01,829] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 309.83 | bwd_inner_microstep: 309.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:04:02,317] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 310.96 | bwd_inner_microstep: 310.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:04:02,806] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 310.03 | bwd_inner_microstep: 310.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:04:03,294] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 310.61 | bwd_inner_microstep: 310.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:04:03,784] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.16 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:04:04,272] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:04:04,760] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:05,247] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.17 | bwd_microstep: 309.73 | bwd_inner_microstep: 309.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:05,735] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:06,223] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:06,712] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.20 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:04:07,201] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:04:07,689] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:04:08,176] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 310.43 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:04:08,664] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.00 | bwd_microstep: 309.49 | bwd_inner_microstep: 309.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:04:09,151] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:04:09,639] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.83 | bwd_microstep: 309.28 | bwd_inner_microstep: 309.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:04:10,126] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 309.46 | bwd_inner_microstep: 309.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 23:04:10,613] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 309.44 | bwd_inner_microstep: 309.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1965 +[2025-04-26 23:04:11,124] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.94 | bwd_microstep: 326.66 | bwd_inner_microstep: 326.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 926 +[2025-04-26 23:04:11,394] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 91.93 | bwd_microstep: 173.50 | bwd_inner_microstep: 173.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:04:11,885] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 311.55 | bwd_inner_microstep: 311.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:04:12,376] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 312.03 | bwd_inner_microstep: 312.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:12,865] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:04:13,353] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 311.26 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:04:13,844] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.80 | bwd_microstep: 311.84 | bwd_inner_microstep: 311.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:04:14,332] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:04:14,821] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.32 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:15,310] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 311.34 | bwd_inner_microstep: 311.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:18,071] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.92 | optimizer_gradients: 17.52 | optimizer_step: 32.02 +[2025-04-26 23:04:18,072] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 1312.07 | bwd_inner_microstep: 339.30 | bwd_allreduce_microstep: 972.73 | step_microstep: 1272.85 +[2025-04-26 23:04:18,073] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5447.00 | bwd: 10800.26 | bwd_inner: 9827.07 | bwd_allreduce: 972.85 | step: 1274.02 + 91%|█████████▏| 282/309 [1:22:50<07:55, 17.60s/it] {'loss': 0.2025, 'learning_rate': 7.994113193832076e-07, 'epoch': 2.72} + 91%|█████████▏| 282/309 [1:22:50<07:55, 17.60s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:18,545] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 294.73 | bwd_inner_microstep: 294.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:04:19,033] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:04:19,520] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.63 | bwd_microstep: 310.49 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:20,008] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 378 +[2025-04-26 23:04:20,140] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.31 | bwd_microstep: 86.82 | bwd_inner_microstep: 86.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:20,626] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 311.46 | bwd_inner_microstep: 311.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:21,115] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 311.95 | bwd_inner_microstep: 311.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:21,603] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 310.60 | bwd_inner_microstep: 310.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:22,091] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:22,580] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:23,068] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:23,558] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.95 | bwd_microstep: 311.13 | bwd_inner_microstep: 311.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:04:23,689] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.94 | bwd_microstep: 86.66 | bwd_inner_microstep: 86.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:24,176] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:24,663] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 311.08 | bwd_inner_microstep: 311.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:25,152] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.97 | bwd_microstep: 310.53 | bwd_inner_microstep: 310.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:04:25,640] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 310.62 | bwd_inner_microstep: 310.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:04:26,129] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.26 | bwd_microstep: 310.04 | bwd_inner_microstep: 310.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:04:26,617] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 310.14 | bwd_inner_microstep: 310.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:04:27,105] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:04:27,593] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 356 +[2025-04-26 23:04:27,723] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.19 | bwd_microstep: 85.15 | bwd_inner_microstep: 85.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1974 +[2025-04-26 23:04:28,233] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.71 | bwd_microstep: 328.00 | bwd_inner_microstep: 327.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 23:04:28,740] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.34 | bwd_microstep: 324.64 | bwd_inner_microstep: 324.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:04:29,230] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 312.39 | bwd_inner_microstep: 312.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:04:29,719] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 311.68 | bwd_inner_microstep: 311.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:04:30,209] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 311.68 | bwd_inner_microstep: 311.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:30,697] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.99 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:04:31,187] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 311.35 | bwd_inner_microstep: 311.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 23:04:31,319] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.28 | bwd_microstep: 87.32 | bwd_inner_microstep: 87.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 892 +[2025-04-26 23:04:31,572] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 84.88 | bwd_microstep: 163.47 | bwd_inner_microstep: 163.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:35,431] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.01 | optimizer_gradients: 17.54 | optimizer_step: 32.04 +[2025-04-26 23:04:35,432] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 2388.97 | bwd_inner_microstep: 339.41 | bwd_allreduce_microstep: 2049.52 | step_microstep: 1294.48 +[2025-04-26 23:04:35,433] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4904.83 | bwd: 10997.05 | bwd_inner: 8947.06 | bwd_allreduce: 2049.65 | step: 1295.64 + 92%|█████████▏| 283/309 [1:23:08<07:35, 17.53s/it] {'loss': 0.215, 'learning_rate': 7.416542530440174e-07, 'epoch': 2.73} + 92%|█████████▏| 283/309 [1:23:08<07:35, 17.53s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:04:35,906] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 295.60 | bwd_inner_microstep: 295.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 23:04:36,041] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 44.06 | bwd_microstep: 86.83 | bwd_inner_microstep: 86.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:04:36,529] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 311.27 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:04:37,017] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:37,503] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:04:37,993] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.25 | bwd_microstep: 311.02 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:04:38,481] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 311.73 | bwd_inner_microstep: 311.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:38,970] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.83 | bwd_microstep: 310.27 | bwd_inner_microstep: 310.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:39,456] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 310.42 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 377 +[2025-04-26 23:04:39,587] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.23 | bwd_microstep: 86.63 | bwd_inner_microstep: 86.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:40,074] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.50 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:04:40,562] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 311.06 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:41,049] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 310.45 | bwd_inner_microstep: 310.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:04:41,537] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:42,025] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 23:04:42,398] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.66 | bwd_microstep: 239.88 | bwd_inner_microstep: 239.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:42,884] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 310.85 | bwd_inner_microstep: 310.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:04:43,373] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.03 | bwd_microstep: 311.08 | bwd_inner_microstep: 311.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:04:43,860] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 309.97 | bwd_inner_microstep: 309.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:04:44,348] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.87 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:04:44,836] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:04:45,326] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 309.76 | bwd_inner_microstep: 309.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2000 +[2025-04-26 23:04:45,843] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.66 | bwd_microstep: 330.25 | bwd_inner_microstep: 330.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 23:04:46,350] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.79 | bwd_microstep: 323.38 | bwd_inner_microstep: 323.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:04:46,840] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 311.89 | bwd_inner_microstep: 311.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:04:47,330] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.03 | bwd_microstep: 311.88 | bwd_inner_microstep: 311.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:04:47,818] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 311.25 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:04:48,308] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.07 | bwd_microstep: 311.78 | bwd_inner_microstep: 311.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1908 +[2025-04-26 23:04:48,798] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 311.32 | bwd_inner_microstep: 311.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:04:49,287] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 311.62 | bwd_inner_microstep: 311.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:04:49,776] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 311.21 | bwd_inner_microstep: 311.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.16 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:04:52,240] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.90 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 23:04:52,240] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 1013.64 | bwd_inner_microstep: 623.64 | bwd_allreduce_microstep: 389.96 | step_microstep: 1272.67 +[2025-04-26 23:04:52,242] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5216.88 | bwd: 10149.59 | bwd_inner: 9759.17 | bwd_allreduce: 390.08 | step: 1274.04 + 92%|█████████▏| 284/309 [1:23:24<07:12, 17.31s/it] {'loss': 0.2263, 'learning_rate': 6.860232312158554e-07, 'epoch': 2.73} + 92%|█████████▏| 284/309 [1:23:24<07:12, 17.31s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:52,715] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 295.62 | bwd_inner_microstep: 295.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:53,202] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:04:53,690] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.64 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:54,178] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.16 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:04:54,667] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 310.94 | bwd_inner_microstep: 310.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:04:55,155] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 310.78 | bwd_inner_microstep: 310.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:55,645] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.21 | bwd_microstep: 311.30 | bwd_inner_microstep: 311.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:04:56,134] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 311.73 | bwd_inner_microstep: 311.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:04:56,623] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 311.70 | bwd_inner_microstep: 311.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:04:57,111] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:57,600] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.76 | bwd_microstep: 311.22 | bwd_inner_microstep: 311.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:58,089] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.75 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:58,577] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 311.20 | bwd_inner_microstep: 311.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:59,065] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 310.56 | bwd_inner_microstep: 310.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:04:59,555] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:05:00,044] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.27 | bwd_microstep: 310.38 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 367 +[2025-04-26 23:05:00,173] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.76 | bwd_microstep: 86.00 | bwd_inner_microstep: 85.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:05:00,660] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 310.88 | bwd_inner_microstep: 310.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 367 +[2025-04-26 23:05:00,791] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.48 | bwd_microstep: 86.06 | bwd_inner_microstep: 86.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:05:01,277] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 310.38 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:05:01,764] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 309.99 | bwd_inner_microstep: 309.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:05:02,251] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 309.65 | bwd_inner_microstep: 309.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1974 +[2025-04-26 23:05:02,764] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.01 | bwd_microstep: 328.46 | bwd_inner_microstep: 328.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 902 +[2025-04-26 23:05:03,033] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 91.03 | bwd_microstep: 172.45 | bwd_inner_microstep: 172.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 901 +[2025-04-26 23:05:03,300] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 90.67 | bwd_microstep: 171.97 | bwd_inner_microstep: 171.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:05:03,789] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 311.58 | bwd_inner_microstep: 311.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 384 +[2025-04-26 23:05:03,920] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.32 | bwd_microstep: 87.52 | bwd_inner_microstep: 87.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:05:04,408] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 311.61 | bwd_inner_microstep: 311.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:05:04,898] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 312.78 | bwd_inner_microstep: 312.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 23:05:05,030] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.80 | bwd_microstep: 87.45 | bwd_inner_microstep: 87.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:05:05,516] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 311.47 | bwd_inner_microstep: 311.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:05:08,142] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.23 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 23:05:08,143] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 1178.42 | bwd_inner_microstep: 339.92 | bwd_allreduce_microstep: 838.46 | step_microstep: 1271.11 +[2025-04-26 23:05:08,144] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4823.91 | bwd: 9644.57 | bwd_inner: 8805.64 | bwd_allreduce: 838.59 | step: 1272.24 + 92%|█████████▏| 285/309 [1:23:40<06:45, 16.89s/it] {'loss': 0.2348, 'learning_rate': 6.325243953415117e-07, 'epoch': 2.74} + 92%|█████████▏| 285/309 [1:23:40<06:45, 16.89s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:05:08,617] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 295.93 | bwd_inner_microstep: 295.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 23:05:08,748] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.36 | bwd_microstep: 86.92 | bwd_inner_microstep: 86.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:05:09,235] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 311.24 | bwd_inner_microstep: 311.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:05:09,722] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 311.07 | bwd_inner_microstep: 311.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 23:05:09,852] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.93 | bwd_microstep: 86.21 | bwd_inner_microstep: 86.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:05:10,339] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 311.48 | bwd_inner_microstep: 311.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:05:10,828] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:05:11,315] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:05:11,803] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.99 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:05:12,293] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.60 | bwd_microstep: 310.99 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:05:12,782] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:05:13,271] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:05:13,758] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:05:14,247] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 310.74 | bwd_inner_microstep: 310.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 23:05:14,378] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.24 | bwd_microstep: 86.59 | bwd_inner_microstep: 86.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:05:14,865] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.18 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1397 +[2025-04-26 23:05:15,239] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.15 | bwd_microstep: 239.84 | bwd_inner_microstep: 239.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1896 +[2025-04-26 23:05:15,725] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 309.64 | bwd_inner_microstep: 309.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 368 +[2025-04-26 23:05:15,854] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.87 | bwd_microstep: 85.94 | bwd_inner_microstep: 85.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:05:16,341] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 23:05:16,828] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 309.76 | bwd_inner_microstep: 309.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1977 +[2025-04-26 23:05:17,340] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.77 | bwd_microstep: 328.00 | bwd_inner_microstep: 327.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1974 +[2025-04-26 23:05:17,850] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.66 | bwd_microstep: 327.07 | bwd_inner_microstep: 327.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 23:05:18,357] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.47 | bwd_microstep: 323.78 | bwd_inner_microstep: 323.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1927 +[2025-04-26 23:05:18,865] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.67 | bwd_microstep: 323.31 | bwd_inner_microstep: 323.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 886 +[2025-04-26 23:05:19,117] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.71 | bwd_microstep: 162.62 | bwd_inner_microstep: 162.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:05:19,606] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 312.33 | bwd_inner_microstep: 312.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:05:20,094] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.80 | bwd_microstep: 311.36 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 382 +[2025-04-26 23:05:20,227] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.18 | bwd_microstep: 87.87 | bwd_inner_microstep: 87.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:05:20,714] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 312.43 | bwd_inner_microstep: 312.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:05:21,203] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 311.86 | bwd_inner_microstep: 311.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 23:05:24,497] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.44 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 23:05:24,497] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 1845.01 | bwd_inner_microstep: 337.53 | bwd_allreduce_microstep: 1507.44 | step_microstep: 1272.31 +[2025-04-26 23:05:24,499] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4737.12 | bwd: 10187.31 | bwd_inner: 8679.40 | bwd_allreduce: 1507.56 | step: 1273.46 + 93%|█████████▎| 286/309 [1:23:57<06:24, 16.73s/it] {'loss': 0.1654, 'learning_rate': 5.811636514789598e-07, 'epoch': 2.75} + 93%|█████████▎| 286/309 [1:23:57<06:24, 16.73s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:05:24,969] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 293.84 | bwd_inner_microstep: 293.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:05:25,458] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 312.40 | bwd_inner_microstep: 312.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:05:25,947] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 311.34 | bwd_inner_microstep: 311.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:05:26,434] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:05:26,922] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:05:27,409] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 23:05:27,897] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 310.56 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:05:28,385] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:05:28,872] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:05:29,360] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 310.26 | bwd_inner_microstep: 310.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:05:29,848] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 311.01 | bwd_inner_microstep: 310.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:05:30,336] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:05:30,822] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:05:31,310] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.11 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:05:31,798] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:05:32,285] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 309.84 | bwd_inner_microstep: 309.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:05:32,772] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:05:33,261] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:05:33,749] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:05:34,236] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:05:34,723] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:05:35,209] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 309.29 | bwd_inner_microstep: 309.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 389 +[2025-04-26 23:05:35,350] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.96 | bwd_microstep: 93.73 | bwd_inner_microstep: 93.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:05:35,838] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 311.34 | bwd_inner_microstep: 311.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 23:05:36,343] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.03 | bwd_microstep: 323.54 | bwd_inner_microstep: 323.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:05:36,833] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.10 | bwd_microstep: 311.21 | bwd_inner_microstep: 311.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:05:37,321] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 311.68 | bwd_inner_microstep: 311.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:05:37,811] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.33 | bwd_microstep: 311.10 | bwd_inner_microstep: 311.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:05:38,300] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 311.30 | bwd_inner_microstep: 311.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1403 +[2025-04-26 23:05:38,674] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.32 | bwd_microstep: 240.85 | bwd_inner_microstep: 240.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:05:39,162] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 311.19 | bwd_inner_microstep: 311.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:05:41,810] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.45 | optimizer_gradients: 17.53 | optimizer_step: 32.04 +[2025-04-26 23:05:41,811] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.80 | bwd_microstep: 1202.32 | bwd_inner_microstep: 339.67 | bwd_allreduce_microstep: 862.61 | step_microstep: 1270.42 +[2025-04-26 23:05:41,812] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5346.11 | bwd: 10543.74 | bwd_inner: 9680.66 | bwd_allreduce: 862.73 | step: 1271.47 + 93%|█████████▎| 287/309 [1:24:14<06:11, 16.90s/it] {'loss': 0.1634, 'learning_rate': 5.31946669649337e-07, 'epoch': 2.76} + 93%|█████████▎| 287/309 [1:24:14<06:11, 16.90s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:05:42,284] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 295.07 | bwd_inner_microstep: 295.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:05:42,772] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 310.40 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:05:43,259] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 311.13 | bwd_inner_microstep: 311.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1908 +[2025-04-26 23:05:43,749] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.79 | bwd_microstep: 311.34 | bwd_inner_microstep: 311.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:05:44,237] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 310.27 | bwd_inner_microstep: 310.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 23:05:44,724] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.18 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:05:45,213] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.47 | bwd_microstep: 311.32 | bwd_inner_microstep: 311.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:05:45,701] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 311.27 | bwd_inner_microstep: 311.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:05:46,189] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:05:46,678] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:05:47,167] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:05:47,654] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:05:48,143] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.53 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:05:48,632] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:05:49,120] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 23:05:49,608] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 310.49 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:05:50,097] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 878 +[2025-04-26 23:05:50,350] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.67 | bwd_microstep: 162.13 | bwd_inner_microstep: 162.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:05:50,836] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 23:05:50,966] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.05 | bwd_microstep: 85.81 | bwd_inner_microstep: 85.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:05:51,452] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2106 +[2025-04-26 23:05:51,990] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.36 | bwd_microstep: 349.20 | bwd_inner_microstep: 349.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1952 +[2025-04-26 23:05:52,498] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.25 | bwd_microstep: 325.21 | bwd_inner_microstep: 325.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 384 +[2025-04-26 23:05:52,631] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 41.04 | bwd_microstep: 87.56 | bwd_inner_microstep: 87.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1939 +[2025-04-26 23:05:53,136] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.19 | bwd_microstep: 324.08 | bwd_inner_microstep: 324.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:05:53,626] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 313.71 | bwd_inner_microstep: 313.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1404 +[2025-04-26 23:05:54,001] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.72 | bwd_microstep: 240.35 | bwd_inner_microstep: 240.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:05:54,488] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:05:54,978] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.11 | bwd_microstep: 311.93 | bwd_inner_microstep: 311.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:05:55,467] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 311.81 | bwd_inner_microstep: 311.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:05:55,956] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 886 +[2025-04-26 23:05:59,703] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.51 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 23:05:59,704] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.77 | bwd_microstep: 2384.11 | bwd_inner_microstep: 190.79 | bwd_allreduce_microstep: 2193.28 | step_microstep: 1272.20 +[2025-04-26 23:05:59,705] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5055.38 | bwd: 11406.03 | bwd_inner: 9212.29 | bwd_allreduce: 2193.40 | step: 1273.34 + 93%|█████████▎| 288/309 [1:24:32<06:01, 17.20s/it] {'loss': 0.2984, 'learning_rate': 4.848788832110151e-07, 'epoch': 2.77} + 93%|█████████▎| 288/309 [1:24:32<06:01, 17.20s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:06:00,176] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 294.29 | bwd_inner_microstep: 294.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:06:00,663] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 311.33 | bwd_inner_microstep: 311.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:06:01,151] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 309.86 | bwd_inner_microstep: 309.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:06:01,639] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:06:02,127] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.12 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:06:02,614] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:03,102] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:06:03,589] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:06:04,077] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:06:04,565] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:06:05,052] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:06:05,540] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:06:06,028] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.99 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:06:06,516] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:07,003] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 310.18 | bwd_inner_microstep: 310.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:06:07,491] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 309.55 | bwd_inner_microstep: 309.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.29 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:06:07,980] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.60 | bwd_microstep: 311.00 | bwd_inner_microstep: 310.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:06:08,467] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.07 | bwd_microstep: 309.74 | bwd_inner_microstep: 309.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:06:08,954] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 23:06:09,083] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.00 | bwd_microstep: 85.79 | bwd_inner_microstep: 85.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:06:09,568] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 309.90 | bwd_inner_microstep: 309.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1955 +[2025-04-26 23:06:10,077] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.48 | bwd_microstep: 325.86 | bwd_inner_microstep: 325.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1986 +[2025-04-26 23:06:10,590] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.09 | bwd_microstep: 329.40 | bwd_inner_microstep: 329.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 417 +[2025-04-26 23:06:10,733] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 43.01 | bwd_microstep: 95.35 | bwd_inner_microstep: 95.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1925 +[2025-04-26 23:06:11,237] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.37 | bwd_microstep: 323.94 | bwd_inner_microstep: 323.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:06:11,726] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 311.48 | bwd_inner_microstep: 311.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:06:12,214] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 311.06 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1402 +[2025-04-26 23:06:12,589] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.88 | bwd_microstep: 240.47 | bwd_inner_microstep: 240.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 23:06:12,728] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 45.89 | bwd_microstep: 86.78 | bwd_inner_microstep: 86.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 377 +[2025-04-26 23:06:12,858] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.49 | bwd_microstep: 86.90 | bwd_inner_microstep: 86.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:06:13,345] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 310.73 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 23:06:16,737] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.08 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 23:06:16,738] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.33 | bwd_microstep: 2077.79 | bwd_inner_microstep: 115.87 | bwd_allreduce_microstep: 1961.88 | step_microstep: 1270.80 +[2025-04-26 23:06:16,739] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4830.72 | bwd: 10775.97 | bwd_inner: 8813.62 | bwd_allreduce: 1962.00 | step: 1272.16 + 94%|█████████▎| 289/309 [1:24:49<05:43, 17.15s/it] {'loss': 0.216, 'learning_rate': 4.399654882597726e-07, 'epoch': 2.78} + 94%|█████████▎| 289/309 [1:24:49<05:43, 17.15s/it]dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 377 +[2025-04-26 23:06:16,852] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 36.83 | bwd_microstep: 69.94 | bwd_inner_microstep: 69.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:17,340] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:17,827] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 310.33 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:06:18,315] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 311.21 | bwd_inner_microstep: 311.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:18,803] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:06:19,290] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 311.11 | bwd_inner_microstep: 311.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:19,776] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 309.95 | bwd_inner_microstep: 309.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:20,263] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 309.93 | bwd_inner_microstep: 309.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:06:20,750] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 310.33 | bwd_inner_microstep: 310.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:06:21,237] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:06:21,725] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 310.14 | bwd_inner_microstep: 310.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:06:22,211] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 309.50 | bwd_inner_microstep: 309.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:22,698] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:06:23,186] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 311.05 | bwd_inner_microstep: 311.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:23,673] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:06:24,160] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.80 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:06:24,648] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:06:25,134] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 310.12 | bwd_inner_microstep: 310.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:06:25,622] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:06:26,107] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.80 | bwd_microstep: 308.81 | bwd_inner_microstep: 308.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:06:26,593] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 309.00 | bwd_inner_microstep: 308.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1974 +[2025-04-26 23:06:27,105] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.03 | bwd_microstep: 327.09 | bwd_inner_microstep: 327.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 940 +[2025-04-26 23:06:27,377] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 92.00 | bwd_microstep: 174.88 | bwd_inner_microstep: 174.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:06:27,865] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.89 | bwd_microstep: 311.98 | bwd_inner_microstep: 311.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:06:28,353] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 311.67 | bwd_inner_microstep: 311.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:06:28,842] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 311.18 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:06:29,330] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 311.51 | bwd_inner_microstep: 311.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:06:29,819] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.95 | bwd_microstep: 310.93 | bwd_inner_microstep: 310.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:06:30,307] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 311.00 | bwd_inner_microstep: 310.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:06:30,796] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.31 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:06:31,284] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:34,420] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.06 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 23:06:34,421] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 1689.91 | bwd_inner_microstep: 340.17 | bwd_allreduce_microstep: 1349.70 | step_microstep: 1270.93 +[2025-04-26 23:06:34,423] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5299.01 | bwd: 10955.35 | bwd_inner: 9605.18 | bwd_allreduce: 1349.83 | step: 1271.99 + 94%|█████████▍| 290/309 [1:25:07<05:28, 17.31s/it] {'loss': 0.1855, 'learning_rate': 3.972114430551632e-07, 'epoch': 2.79} + 94%|█████████▍| 290/309 [1:25:07<05:28, 17.31s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:06:34,894] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 294.39 | bwd_inner_microstep: 294.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:06:35,381] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 310.84 | bwd_inner_microstep: 310.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:35,868] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 310.28 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:36,356] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:36,844] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:06:37,331] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 309.79 | bwd_inner_microstep: 309.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:06:37,819] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 311.16 | bwd_inner_microstep: 311.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1402 +[2025-04-26 23:06:38,194] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.61 | bwd_microstep: 240.64 | bwd_inner_microstep: 240.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:06:38,681] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:06:39,167] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 309.98 | bwd_inner_microstep: 309.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:06:39,655] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 887 +[2025-04-26 23:06:39,908] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.42 | bwd_microstep: 163.25 | bwd_inner_microstep: 163.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 23:06:40,280] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 127.26 | bwd_microstep: 240.32 | bwd_inner_microstep: 240.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:06:40,768] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 311.20 | bwd_inner_microstep: 311.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:06:41,255] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:06:41,743] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.59 | bwd_microstep: 310.40 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:06:42,231] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:06:42,719] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:06:43,206] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:06:43,694] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 310.19 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:06:44,179] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 309.01 | bwd_inner_microstep: 308.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:06:44,664] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 308.99 | bwd_inner_microstep: 308.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 23:06:45,172] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.03 | bwd_microstep: 323.75 | bwd_inner_microstep: 323.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:06:45,660] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:06:46,149] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.88 | bwd_microstep: 311.45 | bwd_inner_microstep: 311.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:06:46,638] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 311.12 | bwd_inner_microstep: 311.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:06:47,125] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 23:06:47,257] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.61 | bwd_microstep: 87.13 | bwd_inner_microstep: 87.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:06:47,745] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 312.21 | bwd_inner_microstep: 312.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:06:48,235] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 312.90 | bwd_inner_microstep: 312.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:06:48,724] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 311.38 | bwd_inner_microstep: 311.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:06:52,421] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.54 | optimizer_gradients: 17.51 | optimizer_step: 32.04 +[2025-04-26 23:06:52,422] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.15 | bwd_microstep: 2251.04 | bwd_inner_microstep: 339.46 | bwd_allreduce_microstep: 1911.54 | step_microstep: 1269.22 +[2025-04-26 23:06:52,423] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5206.91 | bwd: 11367.57 | bwd_inner: 9455.56 | bwd_allreduce: 1911.66 | step: 1270.29 + 94%|█████████▍| 291/309 [1:25:25<05:15, 17.52s/it] {'loss': 0.138, 'learning_rate': 3.5662146747315054e-07, 'epoch': 2.8} + 94%|█████████▍| 291/309 [1:25:25<05:15, 17.52s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:06:52,895] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 294.72 | bwd_inner_microstep: 294.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:06:53,383] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:06:53,870] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 23:06:54,001] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.21 | bwd_microstep: 86.28 | bwd_inner_microstep: 86.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:06:54,487] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 311.60 | bwd_inner_microstep: 311.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:54,975] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:06:55,462] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 310.69 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 368 +[2025-04-26 23:06:55,592] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.88 | bwd_microstep: 85.95 | bwd_inner_microstep: 85.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:06:55,720] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.71 | bwd_microstep: 86.64 | bwd_inner_microstep: 86.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:06:56,206] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 309.82 | bwd_inner_microstep: 309.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:06:56,694] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 311.70 | bwd_inner_microstep: 311.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:06:57,181] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 309.90 | bwd_inner_microstep: 309.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:06:57,669] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.74 | bwd_inner_microstep: 310.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:06:58,158] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:06:58,646] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 311.03 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:06:59,134] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:06:59,621] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 309.97 | bwd_inner_microstep: 309.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.06 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:07:00,109] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.85 | bwd_microstep: 309.64 | bwd_inner_microstep: 309.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:07:00,595] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:07:01,083] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 310.06 | bwd_inner_microstep: 310.04 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:07:01,569] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 309.30 | bwd_inner_microstep: 309.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 23:07:02,056] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 309.79 | bwd_inner_microstep: 309.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 23:07:02,562] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.32 | bwd_microstep: 323.20 | bwd_inner_microstep: 323.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1924 +[2025-04-26 23:07:03,069] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.46 | bwd_microstep: 323.05 | bwd_inner_microstep: 323.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 383 +[2025-04-26 23:07:03,201] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.12 | bwd_microstep: 87.48 | bwd_inner_microstep: 87.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:07:03,688] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 312.11 | bwd_inner_microstep: 312.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:07:04,176] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 311.71 | bwd_inner_microstep: 311.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:07:04,666] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 311.95 | bwd_inner_microstep: 311.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:07:05,153] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:07:05,641] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 311.20 | bwd_inner_microstep: 311.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:07:06,130] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.30 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:07:09,104] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.37 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 23:07:09,104] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 1524.71 | bwd_inner_microstep: 339.68 | bwd_allreduce_microstep: 1184.99 | step_microstep: 1271.97 +[2025-04-26 23:07:09,105] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4985.44 | bwd: 10268.40 | bwd_inner: 9082.94 | bwd_allreduce: 1185.11 | step: 1273.07 + 94%|█████████▍| 292/309 [1:25:41<04:53, 17.27s/it] {'loss': 0.2327, 'learning_rate': 3.1820004248503957e-07, 'epoch': 2.81} + 94%|█████████▍| 292/309 [1:25:41<04:53, 17.27s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:07:09,576] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 294.87 | bwd_inner_microstep: 294.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:07:10,064] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 23:07:10,196] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.16 | bwd_microstep: 86.73 | bwd_inner_microstep: 86.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 23:07:10,681] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 310.73 | bwd_inner_microstep: 310.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:11,169] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 310.55 | bwd_inner_microstep: 310.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:07:11,656] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:07:12,143] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:07:12,631] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.99 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:13,119] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:13,608] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.02 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:14,096] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:07:14,583] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 309.77 | bwd_inner_microstep: 309.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:07:14,713] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.83 | bwd_microstep: 86.62 | bwd_inner_microstep: 86.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:15,200] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 311.51 | bwd_inner_microstep: 311.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:15,688] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:07:16,176] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 310.96 | bwd_inner_microstep: 310.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 23:07:16,548] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.78 | bwd_microstep: 239.22 | bwd_inner_microstep: 239.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:07:17,034] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:07:17,524] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.51 | bwd_microstep: 311.22 | bwd_inner_microstep: 311.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:07:18,011] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 23:07:18,497] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 309.21 | bwd_inner_microstep: 309.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:07:18,983] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 308.95 | bwd_inner_microstep: 308.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1976 +[2025-04-26 23:07:19,494] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.80 | bwd_microstep: 327.22 | bwd_inner_microstep: 327.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 23:07:20,002] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.89 | bwd_microstep: 324.17 | bwd_inner_microstep: 324.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 23:07:20,508] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.82 | bwd_microstep: 322.85 | bwd_inner_microstep: 322.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 23:07:20,640] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 41.45 | bwd_microstep: 86.93 | bwd_inner_microstep: 86.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:07:21,128] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 311.49 | bwd_inner_microstep: 311.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:07:21,617] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 312.09 | bwd_inner_microstep: 312.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1403 +[2025-04-26 23:07:21,992] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.09 | bwd_microstep: 240.72 | bwd_inner_microstep: 240.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:07:22,480] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 311.10 | bwd_inner_microstep: 311.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:07:22,969] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 311.68 | bwd_inner_microstep: 311.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:07:27,147] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.32 | optimizer_gradients: 17.53 | optimizer_step: 32.06 +[2025-04-26 23:07:27,147] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.37 | bwd_microstep: 2729.93 | bwd_inner_microstep: 340.38 | bwd_allreduce_microstep: 2389.52 | step_microstep: 1270.11 +[2025-04-26 23:07:27,149] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5044.04 | bwd: 11574.26 | bwd_inner: 9184.27 | bwd_allreduce: 2389.63 | step: 1271.20 + 95%|█████████▍| 293/309 [1:25:59<04:39, 17.50s/it] {'loss': 0.19, 'learning_rate': 2.8195140966281285e-07, 'epoch': 2.82} + 95%|█████████▍| 293/309 [1:25:59<04:39, 17.50s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:07:27,619] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 293.62 | bwd_inner_microstep: 293.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:07:28,106] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 311.23 | bwd_inner_microstep: 311.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:07:28,593] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:07:28,724] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.90 | bwd_microstep: 86.20 | bwd_inner_microstep: 86.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.06 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:07:29,210] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 311.11 | bwd_inner_microstep: 311.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:29,698] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 310.66 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:07:30,186] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:07:30,674] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:07:31,161] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 309.97 | bwd_inner_microstep: 309.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:07:31,650] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 310.46 | bwd_inner_microstep: 310.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:32,137] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:32,625] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:07:33,112] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:07:33,600] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:07:34,088] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:34,576] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:07:35,063] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 309.86 | bwd_inner_microstep: 309.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 23:07:35,193] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.91 | bwd_microstep: 85.79 | bwd_inner_microstep: 85.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:07:35,679] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:07:36,166] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 23:07:36,652] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.61 | bwd_microstep: 308.87 | bwd_inner_microstep: 308.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2155 +[2025-04-26 23:07:37,196] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.35 | bwd_microstep: 354.23 | bwd_inner_microstep: 354.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1929 +[2025-04-26 23:07:37,703] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.69 | bwd_microstep: 324.16 | bwd_inner_microstep: 324.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 23:07:38,208] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.32 | bwd_microstep: 321.97 | bwd_inner_microstep: 321.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 23:07:38,714] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.92 | bwd_microstep: 322.99 | bwd_inner_microstep: 322.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:07:39,203] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.83 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:07:39,691] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 310.94 | bwd_inner_microstep: 310.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:07:40,180] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.07 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:07:40,669] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 310.68 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:07:41,157] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.35 | bwd_microstep: 310.62 | bwd_inner_microstep: 310.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:07:41,645] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 310.78 | bwd_inner_microstep: 310.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:44,885] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.30 | optimizer_gradients: 17.51 | optimizer_step: 32.03 +[2025-04-26 23:07:44,886] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 1791.11 | bwd_inner_microstep: 339.12 | bwd_allreduce_microstep: 1451.96 | step_microstep: 1272.00 +[2025-04-26 23:07:44,887] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5268.89 | bwd: 11030.09 | bwd_inner: 9577.67 | bwd_allreduce: 1452.07 | step: 1273.17 + 95%|█████████▌| 294/309 [1:26:17<04:23, 17.57s/it] {'loss': 0.2633, 'learning_rate': 2.478795707108672e-07, 'epoch': 2.83} + 95%|█████████▌| 294/309 [1:26:17<04:23, 17.57s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:07:45,359] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 294.91 | bwd_inner_microstep: 294.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:07:45,846] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:07:46,332] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 369 +[2025-04-26 23:07:46,463] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.84 | bwd_microstep: 86.04 | bwd_inner_microstep: 86.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:07:46,950] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 311.72 | bwd_inner_microstep: 311.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:07:47,437] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 310.49 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:47,923] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 310.03 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:07:48,412] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.99 | bwd_microstep: 310.98 | bwd_inner_microstep: 310.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 23:07:48,543] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.78 | bwd_microstep: 86.77 | bwd_inner_microstep: 86.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 23:07:48,671] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.91 | bwd_microstep: 86.50 | bwd_inner_microstep: 86.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:49,157] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 309.93 | bwd_inner_microstep: 309.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:49,645] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 311.42 | bwd_inner_microstep: 311.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:50,133] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 310.47 | bwd_inner_microstep: 310.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:50,620] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:07:51,107] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 309.75 | bwd_inner_microstep: 309.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 23:07:51,594] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 309.71 | bwd_inner_microstep: 309.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 23:07:51,725] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.96 | bwd_microstep: 86.79 | bwd_inner_microstep: 86.77 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:07:52,213] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 311.58 | bwd_inner_microstep: 311.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:07:52,700] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 310.31 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 367 +[2025-04-26 23:07:52,830] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.83 | bwd_microstep: 86.03 | bwd_inner_microstep: 86.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 367 +[2025-04-26 23:07:52,958] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.81 | bwd_microstep: 85.80 | bwd_inner_microstep: 85.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:07:53,442] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 309.20 | bwd_inner_microstep: 309.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1998 +[2025-04-26 23:07:53,958] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.16 | bwd_microstep: 331.55 | bwd_inner_microstep: 331.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1940 +[2025-04-26 23:07:54,465] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.78 | bwd_microstep: 323.99 | bwd_inner_microstep: 323.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:07:54,953] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 311.07 | bwd_inner_microstep: 311.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:07:55,441] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.76 | bwd_microstep: 310.26 | bwd_inner_microstep: 310.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:07:55,931] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 311.68 | bwd_inner_microstep: 311.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 894 +[2025-04-26 23:07:56,185] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.85 | bwd_microstep: 163.33 | bwd_inner_microstep: 163.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:07:56,673] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 311.75 | bwd_inner_microstep: 311.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:07:57,162] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 311.75 | bwd_inner_microstep: 311.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:07:57,652] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 311.85 | bwd_inner_microstep: 311.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:08:02,698] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.50 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 23:08:02,699] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 3599.24 | bwd_inner_microstep: 338.67 | bwd_allreduce_microstep: 3260.54 | step_microstep: 1270.31 +[2025-04-26 23:08:02,700] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4626.83 | bwd: 11755.96 | bwd_inner: 8494.95 | bwd_allreduce: 3260.65 | step: 1271.43 + 95%|█████████▌| 295/309 [1:26:35<04:07, 17.64s/it] {'loss': 0.252, 'learning_rate': 2.1598828702424467e-07, 'epoch': 2.84} + 95%|█████████▌| 295/309 [1:26:35<04:07, 17.64s/it]dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 23:08:02,813] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.14 | bwd_microstep: 69.64 | bwd_inner_microstep: 69.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 886 +[2025-04-26 23:08:03,063] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 84.07 | bwd_microstep: 161.96 | bwd_inner_microstep: 161.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:08:03,548] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 309.44 | bwd_inner_microstep: 309.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:08:04,035] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 309.84 | bwd_inner_microstep: 309.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:08:04,522] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 309.82 | bwd_inner_microstep: 309.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:08:05,007] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 309.09 | bwd_inner_microstep: 309.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:08:05,496] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 310.74 | bwd_inner_microstep: 310.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 378 +[2025-04-26 23:08:05,627] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.18 | bwd_microstep: 86.58 | bwd_inner_microstep: 86.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:08:06,112] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:08:06,600] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 311.18 | bwd_inner_microstep: 311.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:07,088] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 310.56 | bwd_inner_microstep: 310.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:08:07,575] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 309.96 | bwd_inner_microstep: 309.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:08:08,062] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:08:08,550] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 309.94 | bwd_inner_microstep: 309.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:08:09,037] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 310.40 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:08:09,524] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 309.60 | bwd_inner_microstep: 309.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:10,011] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:08:10,499] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.93 | bwd_microstep: 309.48 | bwd_inner_microstep: 309.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:08:10,985] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 309.58 | bwd_inner_microstep: 309.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:08:11,471] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 309.20 | bwd_inner_microstep: 309.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:08:11,957] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 308.62 | bwd_inner_microstep: 308.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:08:12,443] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 308.71 | bwd_inner_microstep: 308.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 23:08:12,949] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.84 | bwd_microstep: 323.31 | bwd_inner_microstep: 323.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 392 +[2025-04-26 23:08:13,090] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 42.48 | bwd_microstep: 94.22 | bwd_inner_microstep: 94.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:08:13,577] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 312.01 | bwd_inner_microstep: 312.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:08:14,065] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 311.06 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:08:14,553] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.51 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:08:15,043] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 311.95 | bwd_inner_microstep: 311.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1405 +[2025-04-26 23:08:15,416] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.86 | bwd_microstep: 240.03 | bwd_inner_microstep: 240.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:08:15,903] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:16,390] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:08:19,991] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.53 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 23:08:19,991] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.95 | bwd_microstep: 2151.70 | bwd_inner_microstep: 338.15 | bwd_allreduce_microstep: 1813.51 | step_microstep: 1272.18 +[2025-04-26 23:08:19,993] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4976.15 | bwd: 10881.07 | bwd_inner: 9067.10 | bwd_allreduce: 1813.62 | step: 1273.35 + 96%|█████████▌| 296/309 [1:26:52<03:48, 17.54s/it] {'loss': 0.1745, 'learning_rate': 1.862810792733849e-07, 'epoch': 2.85} + 96%|█████████▌| 296/309 [1:26:52<03:48, 17.54s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:08:20,463] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 294.22 | bwd_inner_microstep: 294.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:20,951] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 310.79 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:08:21,439] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 310.75 | bwd_inner_microstep: 310.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:21,926] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 309.65 | bwd_inner_microstep: 309.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:08:22,412] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 309.88 | bwd_inner_microstep: 309.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 885 +[2025-04-26 23:08:22,665] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.95 | bwd_microstep: 162.42 | bwd_inner_microstep: 162.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:08:23,151] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 311.38 | bwd_inner_microstep: 311.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:23,638] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 310.03 | bwd_inner_microstep: 310.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:24,125] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 310.04 | bwd_inner_microstep: 310.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:08:24,613] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:08:25,101] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:08:25,588] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 309.39 | bwd_inner_microstep: 309.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:08:26,075] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 309.72 | bwd_inner_microstep: 309.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:08:26,563] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 310.12 | bwd_inner_microstep: 310.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1397 +[2025-04-26 23:08:26,936] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.40 | bwd_microstep: 239.39 | bwd_inner_microstep: 239.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:08:27,423] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 310.24 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:27,910] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 310.34 | bwd_inner_microstep: 310.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:08:28,399] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:08:28,886] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 309.47 | bwd_inner_microstep: 309.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:08:29,373] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.64 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:08:29,862] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:08:30,347] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 309.32 | bwd_inner_microstep: 309.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1942 +[2025-04-26 23:08:30,854] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.54 | bwd_microstep: 323.82 | bwd_inner_microstep: 323.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 23:08:31,360] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.41 | bwd_microstep: 322.59 | bwd_inner_microstep: 322.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1927 +[2025-04-26 23:08:31,865] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.51 | bwd_microstep: 322.93 | bwd_inner_microstep: 322.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:08:32,354] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.93 | bwd_microstep: 310.97 | bwd_inner_microstep: 310.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1908 +[2025-04-26 23:08:32,844] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 309.68 | bwd_inner_microstep: 309.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:08:33,333] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.06 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:08:33,823] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.28 | bwd_microstep: 310.84 | bwd_inner_microstep: 310.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:08:34,311] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 310.95 | bwd_inner_microstep: 310.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:08:34,799] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.12 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:08:37,310] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.95 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 23:08:37,311] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 1064.13 | bwd_inner_microstep: 530.17 | bwd_allreduce_microstep: 533.52 | step_microstep: 1269.70 +[2025-04-26 23:08:37,312] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5390.44 | bwd: 10486.89 | bwd_inner: 9952.61 | bwd_allreduce: 533.52 | step: 1270.90 + 96%|█████████▌| 297/309 [1:27:09<03:29, 17.47s/it] {'loss': 0.2287, 'learning_rate': 1.5876122701546481e-07, 'epoch': 2.86} + 96%|█████████▌| 297/309 [1:27:09<03:29, 17.47s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:08:37,784] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 294.49 | bwd_inner_microstep: 294.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:08:38,273] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.53 | bwd_microstep: 311.93 | bwd_inner_microstep: 311.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:08:38,760] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 309.65 | bwd_inner_microstep: 309.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:08:39,246] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:08:39,735] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 311.15 | bwd_inner_microstep: 311.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 886 +[2025-04-26 23:08:39,987] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.90 | bwd_microstep: 162.64 | bwd_inner_microstep: 162.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:40,474] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 382 +[2025-04-26 23:08:40,605] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.86 | bwd_microstep: 87.31 | bwd_inner_microstep: 87.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:08:41,091] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 311.18 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:41,578] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.19 | bwd_inner_microstep: 310.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:42,066] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:08:42,553] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 309.53 | bwd_inner_microstep: 309.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:43,042] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.77 | bwd_microstep: 310.91 | bwd_inner_microstep: 310.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:43,529] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 23:08:43,660] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.92 | bwd_microstep: 86.22 | bwd_inner_microstep: 86.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:08:44,146] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 311.56 | bwd_inner_microstep: 311.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:08:44,633] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:08:45,120] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 310.35 | bwd_inner_microstep: 310.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:08:45,608] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:08:46,095] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 23:08:46,225] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.81 | bwd_microstep: 85.68 | bwd_inner_microstep: 85.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 358 +[2025-04-26 23:08:46,352] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.75 | bwd_microstep: 85.29 | bwd_inner_microstep: 85.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1965 +[2025-04-26 23:08:46,861] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.08 | bwd_microstep: 326.33 | bwd_inner_microstep: 326.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:08:47,350] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 313.08 | bwd_inner_microstep: 313.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 23:08:47,482] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.10 | bwd_microstep: 87.06 | bwd_inner_microstep: 87.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:08:47,969] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 311.77 | bwd_inner_microstep: 311.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:08:48,459] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.95 | bwd_microstep: 311.69 | bwd_inner_microstep: 311.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:08:48,947] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 311.37 | bwd_inner_microstep: 311.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:08:49,435] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 310.91 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:08:49,924] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 892 +[2025-04-26 23:08:50,178] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.23 | bwd_microstep: 163.81 | bwd_inner_microstep: 163.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:08:54,592] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.64 | optimizer_gradients: 17.52 | optimizer_step: 32.05 +[2025-04-26 23:08:54,593] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 2967.58 | bwd_inner_microstep: 339.31 | bwd_allreduce_microstep: 2628.23 | step_microstep: 1272.47 +[2025-04-26 23:08:54,594] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4670.76 | bwd: 11184.74 | bwd_inner: 8556.04 | bwd_allreduce: 2628.35 | step: 1273.53 + 96%|█████████▋| 298/309 [1:27:27<03:11, 17.42s/it] {'loss': 0.2116, 'learning_rate': 1.3343176833234161e-07, 'epoch': 2.87} + 96%|█████████▋| 298/309 [1:27:27<03:11, 17.42s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:08:55,063] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.32 | bwd_microstep: 293.07 | bwd_inner_microstep: 293.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:08:55,551] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 312.00 | bwd_inner_microstep: 311.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:08:56,039] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.33 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:56,525] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.51 | bwd_microstep: 309.76 | bwd_inner_microstep: 309.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:08:57,013] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.21 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:57,499] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 309.81 | bwd_inner_microstep: 309.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:08:57,630] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.92 | bwd_microstep: 86.49 | bwd_inner_microstep: 86.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 367 +[2025-04-26 23:08:57,757] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.79 | bwd_microstep: 85.52 | bwd_inner_microstep: 85.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:08:58,244] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 311.55 | bwd_inner_microstep: 311.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:58,731] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 310.50 | bwd_inner_microstep: 310.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:08:59,217] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:08:59,705] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 309.71 | bwd_inner_microstep: 309.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:00,193] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:00,681] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.00 | bwd_microstep: 310.38 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:01,169] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 310.21 | bwd_inner_microstep: 310.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:09:01,655] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 309.68 | bwd_inner_microstep: 309.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:02,143] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.38 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:09:02,630] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 310.12 | bwd_inner_microstep: 310.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:09:03,118] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:09:03,605] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 309.87 | bwd_inner_microstep: 309.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:09:04,092] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2238 +[2025-04-26 23:09:04,644] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.94 | bwd_microstep: 359.14 | bwd_inner_microstep: 359.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1975 +[2025-04-26 23:09:05,156] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.57 | bwd_microstep: 327.20 | bwd_inner_microstep: 327.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 23:09:05,662] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.67 | bwd_microstep: 323.26 | bwd_inner_microstep: 323.24 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 23:09:05,793] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.05 | bwd_microstep: 86.51 | bwd_inner_microstep: 86.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:09:06,280] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 312.04 | bwd_inner_microstep: 312.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:09:06,768] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 311.59 | bwd_inner_microstep: 311.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:09:07,256] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 310.92 | bwd_inner_microstep: 310.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:09:07,745] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:09:08,233] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.73 | bwd_microstep: 310.17 | bwd_inner_microstep: 310.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:09:08,721] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 311.20 | bwd_inner_microstep: 311.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:09:11,941] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.58 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 23:09:11,942] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 1772.08 | bwd_inner_microstep: 337.40 | bwd_allreduce_microstep: 1434.64 | step_microstep: 1270.37 +[2025-04-26 23:09:11,943] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5133.41 | bwd: 10785.61 | bwd_inner: 9350.51 | bwd_allreduce: 1434.76 | step: 1271.49 + 97%|█████████▋| 299/309 [1:27:44<02:53, 17.40s/it] {'loss': 0.227, 'learning_rate': 1.1029549949516549e-07, 'epoch': 2.88} + 97%|█████████▋| 299/309 [1:27:44<02:53, 17.40s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1907 +[2025-04-26 23:09:12,413] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 293.59 | bwd_inner_microstep: 293.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:09:12,901] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 311.09 | bwd_inner_microstep: 311.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:09:13,387] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 309.41 | bwd_inner_microstep: 309.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:09:13,875] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 311.06 | bwd_inner_microstep: 311.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:09:14,363] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.27 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:09:14,850] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 310.14 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:09:15,337] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 309.99 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:09:15,827] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 311.76 | bwd_inner_microstep: 311.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:16,314] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:16,800] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 310.36 | bwd_inner_microstep: 310.35 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:09:17,287] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.15 | bwd_microstep: 309.42 | bwd_inner_microstep: 309.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:09:17,774] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 310.13 | bwd_inner_microstep: 310.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:09:18,263] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 311.44 | bwd_inner_microstep: 311.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:18,751] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 311.05 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 23:09:19,124] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.71 | bwd_microstep: 239.73 | bwd_inner_microstep: 239.72 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 368 +[2025-04-26 23:09:19,253] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 38.50 | bwd_microstep: 86.03 | bwd_inner_microstep: 86.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:09:19,739] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 310.03 | bwd_inner_microstep: 310.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:09:20,227] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:09:20,712] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 309.58 | bwd_inner_microstep: 309.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:09:21,200] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 310.12 | bwd_inner_microstep: 310.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:09:21,687] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 309.95 | bwd_inner_microstep: 309.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2017 +[2025-04-26 23:09:22,205] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.70 | bwd_microstep: 332.88 | bwd_inner_microstep: 332.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1950 +[2025-04-26 23:09:22,712] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.29 | bwd_microstep: 324.79 | bwd_inner_microstep: 324.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 23:09:23,218] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.64 | bwd_microstep: 323.03 | bwd_inner_microstep: 323.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 23:09:23,724] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.80 | bwd_microstep: 323.35 | bwd_inner_microstep: 323.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:09:24,214] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 311.79 | bwd_inner_microstep: 311.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 23:09:24,345] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.24 | bwd_microstep: 87.33 | bwd_inner_microstep: 87.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:09:24,832] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 311.21 | bwd_inner_microstep: 311.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 378 +[2025-04-26 23:09:24,964] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.52 | bwd_microstep: 86.96 | bwd_inner_microstep: 86.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:09:25,451] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:25,940] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 311.29 | bwd_inner_microstep: 311.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:09:28,725] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.32 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 23:09:28,726] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.19 | bwd_microstep: 1336.66 | bwd_inner_microstep: 338.59 | bwd_allreduce_microstep: 998.03 | step_microstep: 1272.05 +[2025-04-26 23:09:28,727] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5094.33 | bwd: 10266.84 | bwd_inner: 9268.35 | bwd_allreduce: 998.15 | step: 1273.12 + 97%|█████████▋| 300/309 [1:28:01<02:34, 17.21s/it] {'loss': 0.2794, 'learning_rate': 8.935497465567989e-08, 'epoch': 2.89} + 97%|█████████▋| 300/309 [1:28:01<02:34, 17.21s/it]dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 377 +[2025-04-26 23:09:28,839] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 36.15 | bwd_microstep: 70.07 | bwd_inner_microstep: 70.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:09:29,325] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 310.01 | bwd_inner_microstep: 310.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:29,812] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 311.94 | bwd_inner_microstep: 311.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1908 +[2025-04-26 23:09:30,300] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:09:30,786] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 309.89 | bwd_inner_microstep: 309.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:31,274] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 310.29 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:09:31,762] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:09:32,250] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:09:32,737] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 310.48 | bwd_inner_microstep: 310.47 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:33,225] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 310.16 | bwd_inner_microstep: 310.15 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:09:33,712] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 310.31 | bwd_inner_microstep: 310.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:09:34,200] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 310.95 | bwd_inner_microstep: 310.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:34,688] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.59 | bwd_microstep: 310.70 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:35,176] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:35,664] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.93 | bwd_microstep: 310.85 | bwd_inner_microstep: 310.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:36,152] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:09:36,639] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:09:37,126] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 309.96 | bwd_inner_microstep: 309.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:09:37,613] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 310.26 | bwd_inner_microstep: 310.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:09:38,101] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.07 | bwd_microstep: 309.90 | bwd_inner_microstep: 309.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:09:38,588] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 309.33 | bwd_inner_microstep: 309.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1998 +[2025-04-26 23:09:39,102] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.42 | bwd_microstep: 330.06 | bwd_inner_microstep: 330.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1976 +[2025-04-26 23:09:39,613] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.44 | bwd_microstep: 327.07 | bwd_inner_microstep: 327.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:09:40,103] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.48 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:09:40,592] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.80 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 892 +[2025-04-26 23:09:40,846] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.28 | bwd_microstep: 163.21 | bwd_inner_microstep: 163.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:09:41,335] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 312.31 | bwd_inner_microstep: 312.29 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:09:41,824] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 311.01 | bwd_inner_microstep: 311.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:09:42,314] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 312.14 | bwd_inner_microstep: 312.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:09:42,804] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.17 | bwd_microstep: 310.98 | bwd_inner_microstep: 310.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1403 +[2025-04-26 23:09:43,179] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.29 | bwd_microstep: 240.75 | bwd_inner_microstep: 240.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:09:45,638] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.44 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 23:09:45,639] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 1013.43 | bwd_inner_microstep: 624.29 | bwd_allreduce_microstep: 389.11 | step_microstep: 1270.27 +[2025-04-26 23:09:45,640] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5261.06 | bwd: 10221.85 | bwd_inner: 9832.28 | bwd_allreduce: 389.23 | step: 1271.38 + 97%|█████████▋| 301/309 [1:28:18<02:16, 17.12s/it] {'loss': 0.2103, 'learning_rate': 7.06125055642537e-08, 'epoch': 2.9} + 97%|█████████▋| 301/309 [1:28:18<02:16, 17.12s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:09:46,112] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 295.40 | bwd_inner_microstep: 295.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:09:46,601] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:09:47,089] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 310.75 | bwd_inner_microstep: 310.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:09:47,578] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:09:48,066] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 310.53 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:09:48,554] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 310.73 | bwd_inner_microstep: 310.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:09:49,043] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.88 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:09:49,530] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 310.87 | bwd_inner_microstep: 310.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:09:50,019] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.90 | bwd_microstep: 310.69 | bwd_inner_microstep: 310.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:09:50,507] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 310.42 | bwd_inner_microstep: 310.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 367 +[2025-04-26 23:09:50,637] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.92 | bwd_microstep: 85.88 | bwd_inner_microstep: 85.86 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:51,123] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 311.15 | bwd_inner_microstep: 311.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:09:51,611] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 310.40 | bwd_inner_microstep: 310.39 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:52,099] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:52,588] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 310.97 | bwd_inner_microstep: 310.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:09:53,076] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.07 | bwd_microstep: 309.99 | bwd_inner_microstep: 309.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:09:53,564] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:09:54,053] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 310.59 | bwd_inner_microstep: 310.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:09:54,541] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:09:55,028] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:09:55,516] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 309.14 | bwd_inner_microstep: 309.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:09:56,003] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 310.01 | bwd_inner_microstep: 310.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1952 +[2025-04-26 23:09:56,512] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.38 | bwd_microstep: 324.94 | bwd_inner_microstep: 324.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1929 +[2025-04-26 23:09:57,018] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.65 | bwd_microstep: 323.57 | bwd_inner_microstep: 323.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1922 +[2025-04-26 23:09:57,525] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.48 | bwd_microstep: 322.81 | bwd_inner_microstep: 322.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:09:58,015] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.15 | bwd_microstep: 311.32 | bwd_inner_microstep: 311.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:09:58,505] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.21 | bwd_microstep: 311.70 | bwd_inner_microstep: 311.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:09:58,994] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.23 | bwd_microstep: 311.29 | bwd_inner_microstep: 311.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:09:59,485] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.42 | bwd_microstep: 312.01 | bwd_inner_microstep: 312.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:09:59,976] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.59 | bwd_microstep: 312.12 | bwd_inner_microstep: 312.11 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:10:00,465] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 311.71 | bwd_inner_microstep: 311.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 23:10:03,249] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1205.37 | optimizer_gradients: 17.54 | optimizer_step: 32.04 +[2025-04-26 23:10:03,249] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.66 | bwd_microstep: 1466.38 | bwd_inner_microstep: 116.11 | bwd_allreduce_microstep: 1350.22 | step_microstep: 1273.15 +[2025-04-26 23:10:03,251] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5269.77 | bwd: 10899.79 | bwd_inner: 9549.09 | bwd_allreduce: 1350.35 | step: 1274.26 + 98%|█████████▊| 302/309 [1:28:35<02:00, 17.27s/it] {'loss': 0.2188, 'learning_rate': 5.4070161314676574e-08, 'epoch': 2.91} + 98%|█████████▊| 302/309 [1:28:35<02:00, 17.27s/it]dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 370 +[2025-04-26 23:10:03,362] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 36.41 | bwd_microstep: 69.35 | bwd_inner_microstep: 69.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:10:03,848] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:10:04,337] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.26 | bwd_microstep: 312.35 | bwd_inner_microstep: 312.34 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:10:04,824] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:10:05,311] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 310.98 | bwd_inner_microstep: 310.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:10:05,799] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.19 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:10:06,287] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:10:06,775] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 310.30 | bwd_inner_microstep: 310.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:10:07,262] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:10:07,750] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:10:08,238] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 310.56 | bwd_inner_microstep: 310.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:10:08,727] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.10 | bwd_microstep: 309.88 | bwd_inner_microstep: 309.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1398 +[2025-04-26 23:10:09,100] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.94 | bwd_microstep: 240.10 | bwd_inner_microstep: 240.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:10:09,587] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:10:10,075] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 311.31 | bwd_inner_microstep: 311.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:10:10,564] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.81 | bwd_microstep: 311.54 | bwd_inner_microstep: 311.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 881 +[2025-04-26 23:10:10,818] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.66 | bwd_microstep: 162.59 | bwd_inner_microstep: 162.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:10:11,303] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 309.92 | bwd_inner_microstep: 309.91 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:10:11,790] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 311.05 | bwd_inner_microstep: 311.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 879 +[2025-04-26 23:10:12,043] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.89 | bwd_microstep: 162.34 | bwd_inner_microstep: 162.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:10:12,530] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 868 +[2025-04-26 23:10:12,781] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.69 | bwd_microstep: 161.15 | bwd_inner_microstep: 161.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 402 +[2025-04-26 23:10:12,921] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 41.98 | bwd_microstep: 94.66 | bwd_inner_microstep: 94.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 23:10:13,426] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.65 | bwd_microstep: 323.48 | bwd_inner_microstep: 323.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 23:10:13,933] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.32 | bwd_microstep: 323.98 | bwd_inner_microstep: 323.96 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:10:14,423] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.03 | bwd_microstep: 312.31 | bwd_inner_microstep: 312.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1402 +[2025-04-26 23:10:14,797] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 129.49 | bwd_microstep: 239.61 | bwd_inner_microstep: 239.60 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 378 +[2025-04-26 23:10:14,928] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.48 | bwd_microstep: 87.50 | bwd_inner_microstep: 87.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:10:15,415] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 311.27 | bwd_inner_microstep: 311.25 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:10:15,905] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.93 | bwd_microstep: 311.90 | bwd_inner_microstep: 311.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:10:16,394] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 311.55 | bwd_inner_microstep: 311.54 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 886 +[2025-04-26 23:10:20,896] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.86 | optimizer_gradients: 17.53 | optimizer_step: 32.05 +[2025-04-26 23:10:20,896] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.73 | bwd_microstep: 3141.98 | bwd_inner_microstep: 191.23 | bwd_allreduce_microstep: 2950.71 | step_microstep: 1269.61 +[2025-04-26 23:10:20,898] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4690.59 | bwd: 11537.53 | bwd_inner: 8586.36 | bwd_allreduce: 2950.83 | step: 1270.76 + 98%|█████████▊| 303/309 [1:28:53<01:44, 17.38s/it] {'loss': 0.2068, 'learning_rate': 3.972976811573048e-08, 'epoch': 2.92} + 98%|█████████▊| 303/309 [1:28:53<01:44, 17.38s/it]dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 23:10:21,010] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 36.36 | bwd_microstep: 69.86 | bwd_inner_microstep: 69.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:10:21,495] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 310.31 | bwd_inner_microstep: 310.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 378 +[2025-04-26 23:10:21,626] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.61 | bwd_microstep: 86.33 | bwd_inner_microstep: 86.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:10:22,114] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.18 | bwd_microstep: 311.57 | bwd_inner_microstep: 311.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:10:22,600] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.37 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:10:23,088] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.76 | bwd_microstep: 311.13 | bwd_inner_microstep: 311.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:10:23,574] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 309.77 | bwd_inner_microstep: 309.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:10:24,061] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 309.85 | bwd_inner_microstep: 309.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:10:24,549] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.37 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:10:25,036] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.59 | bwd_microstep: 310.06 | bwd_inner_microstep: 310.05 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:10:25,167] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.05 | bwd_microstep: 86.55 | bwd_inner_microstep: 86.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 374 +[2025-04-26 23:10:25,296] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.87 | bwd_microstep: 86.90 | bwd_inner_microstep: 86.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:10:25,783] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 312.08 | bwd_inner_microstep: 312.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:10:26,270] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 311.04 | bwd_inner_microstep: 311.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:10:26,756] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.53 | bwd_microstep: 309.90 | bwd_inner_microstep: 309.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 870 +[2025-04-26 23:10:27,007] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.91 | bwd_microstep: 161.33 | bwd_inner_microstep: 161.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:10:27,493] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 310.90 | bwd_inner_microstep: 310.89 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:10:27,979] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 309.59 | bwd_inner_microstep: 309.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 869 +[2025-04-26 23:10:28,230] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 85.63 | bwd_microstep: 161.48 | bwd_inner_microstep: 161.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1892 +[2025-04-26 23:10:28,716] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1894 +[2025-04-26 23:10:29,201] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 309.02 | bwd_inner_microstep: 309.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 23:10:29,706] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.77 | bwd_microstep: 322.94 | bwd_inner_microstep: 322.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1952 +[2025-04-26 23:10:30,215] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.86 | bwd_microstep: 324.95 | bwd_inner_microstep: 324.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1938 +[2025-04-26 23:10:30,721] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.73 | bwd_microstep: 323.58 | bwd_inner_microstep: 323.57 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:10:31,210] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:10:31,698] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.53 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 383 +[2025-04-26 23:10:31,829] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.23 | bwd_microstep: 87.34 | bwd_inner_microstep: 87.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:10:32,316] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 311.51 | bwd_inner_microstep: 311.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:10:32,805] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 311.68 | bwd_inner_microstep: 311.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:10:33,293] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 311.10 | bwd_inner_microstep: 311.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:10:33,782] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 311.50 | bwd_inner_microstep: 311.49 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:10:38,315] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1204.54 | optimizer_gradients: 17.52 | optimizer_step: 32.04 +[2025-04-26 23:10:38,315] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.16 | bwd_microstep: 3083.09 | bwd_inner_microstep: 338.63 | bwd_allreduce_microstep: 2744.42 | step_microstep: 1272.31 +[2025-04-26 23:10:38,317] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4680.99 | bwd: 11317.52 | bwd_inner: 8572.63 | bwd_allreduce: 2744.55 | step: 1273.34 + 98%|█████████▊| 304/309 [1:29:10<01:26, 17.39s/it] {'loss': 0.2937, 'learning_rate': 2.7592909089593224e-08, 'epoch': 2.93} + 98%|█████████▊| 304/309 [1:29:10<01:26, 17.39s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:10:38,787] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 293.78 | bwd_inner_microstep: 293.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:10:39,274] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 310.84 | bwd_inner_microstep: 310.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 377 +[2025-04-26 23:10:39,405] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.05 | bwd_microstep: 86.88 | bwd_inner_microstep: 86.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 377 +[2025-04-26 23:10:39,534] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.89 | bwd_microstep: 86.85 | bwd_inner_microstep: 86.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:10:40,019] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 309.91 | bwd_inner_microstep: 309.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:10:40,506] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 310.89 | bwd_inner_microstep: 310.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:10:40,993] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 309.86 | bwd_inner_microstep: 309.85 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.06 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1402 +[2025-04-26 23:10:41,366] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.50 | bwd_microstep: 239.81 | bwd_inner_microstep: 239.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:10:41,851] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 310.01 | bwd_inner_microstep: 310.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:10:42,338] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.25 | bwd_inner_microstep: 310.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:10:42,825] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 309.74 | bwd_inner_microstep: 309.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 23:10:43,311] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 309.65 | bwd_inner_microstep: 309.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:10:43,799] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.81 | bwd_microstep: 309.80 | bwd_inner_microstep: 309.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:10:44,286] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 310.14 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:10:44,774] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.51 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:10:45,261] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 309.58 | bwd_inner_microstep: 309.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 366 +[2025-04-26 23:10:45,391] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.94 | bwd_microstep: 85.57 | bwd_inner_microstep: 85.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:10:45,877] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 310.85 | bwd_inner_microstep: 310.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:10:46,365] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.64 | bwd_microstep: 310.78 | bwd_inner_microstep: 310.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:10:46,852] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 309.98 | bwd_inner_microstep: 309.97 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:10:47,339] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2000 +[2025-04-26 23:10:47,854] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.39 | bwd_microstep: 330.34 | bwd_inner_microstep: 330.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 498 +[2025-04-26 23:10:48,005] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 43.96 | bwd_microstep: 102.47 | bwd_inner_microstep: 102.46 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1924 +[2025-04-26 23:10:48,510] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.79 | bwd_microstep: 323.78 | bwd_inner_microstep: 323.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1927 +[2025-04-26 23:10:49,016] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.02 | bwd_microstep: 324.34 | bwd_inner_microstep: 324.33 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:10:49,505] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 310.69 | bwd_inner_microstep: 310.67 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:10:49,994] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:10:50,484] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.67 | bwd_microstep: 311.00 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:10:50,972] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 310.88 | bwd_inner_microstep: 310.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:10:51,461] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.31 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1907 +[2025-04-26 23:10:51,949] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 310.39 | bwd_inner_microstep: 310.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:10:55,214] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.26 | optimizer_gradients: 17.52 | optimizer_step: 32.03 +[2025-04-26 23:10:55,214] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.13 | bwd_microstep: 1817.61 | bwd_inner_microstep: 338.26 | bwd_allreduce_microstep: 1479.31 | step_microstep: 1269.97 +[2025-04-26 23:10:55,216] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4951.65 | bwd: 10518.65 | bwd_inner: 9038.88 | bwd_allreduce: 1479.43 | step: 1271.16 + 99%|█████████▊| 305/309 [1:29:27<01:08, 17.24s/it] {'loss': 0.3126, 'learning_rate': 1.766092409706266e-08, 'epoch': 2.94} + 99%|█████████▊| 305/309 [1:29:27<01:08, 17.24s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:10:55,686] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 294.38 | bwd_inner_microstep: 294.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:10:56,174] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 310.65 | bwd_inner_microstep: 310.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:10:56,663] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.12 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:10:57,151] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 310.04 | bwd_inner_microstep: 310.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:10:57,639] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 310.07 | bwd_inner_microstep: 310.06 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:10:58,126] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:10:58,615] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 310.76 | bwd_inner_microstep: 310.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1399 +[2025-04-26 23:10:58,987] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 128.21 | bwd_microstep: 239.37 | bwd_inner_microstep: 239.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:10:59,474] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 310.83 | bwd_inner_microstep: 310.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1906 +[2025-04-26 23:10:59,961] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 310.37 | bwd_inner_microstep: 310.36 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 370 +[2025-04-26 23:11:00,092] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.62 | bwd_microstep: 86.21 | bwd_inner_microstep: 86.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:00,579] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 310.33 | bwd_inner_microstep: 310.31 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:11:01,067] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.64 | bwd_microstep: 311.72 | bwd_inner_microstep: 311.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:11:01,555] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.02 | bwd_inner_microstep: 310.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:02,042] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:02,530] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 310.43 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:11:03,020] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.80 | bwd_microstep: 311.19 | bwd_inner_microstep: 311.17 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:11:03,507] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:11:03,995] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 310.43 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:11:04,482] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 309.62 | bwd_inner_microstep: 309.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:11:04,970] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 310.20 | bwd_inner_microstep: 310.19 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:11:05,457] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 309.27 | bwd_inner_microstep: 309.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 23:11:05,963] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.52 | bwd_microstep: 323.00 | bwd_inner_microstep: 322.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1918 +[2025-04-26 23:11:06,452] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.17 | bwd_microstep: 311.40 | bwd_inner_microstep: 311.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1919 +[2025-04-26 23:11:06,943] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.88 | bwd_microstep: 311.83 | bwd_inner_microstep: 311.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:11:07,431] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:11:07,920] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:11:08,409] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.83 | bwd_microstep: 310.74 | bwd_inner_microstep: 310.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:11:08,899] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 311.45 | bwd_inner_microstep: 311.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 375 +[2025-04-26 23:11:09,030] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.45 | bwd_microstep: 86.72 | bwd_inner_microstep: 86.71 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 23:11:09,161] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.00 | bwd_microstep: 86.74 | bwd_inner_microstep: 86.73 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:11:11,745] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1206.17 | optimizer_gradients: 17.55 | optimizer_step: 32.04 +[2025-04-26 23:11:11,745] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 1134.15 | bwd_inner_microstep: 369.40 | bwd_allreduce_microstep: 764.35 | step_microstep: 1274.03 +[2025-04-26 23:11:11,747] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5076.00 | bwd: 10014.65 | bwd_inner: 9249.66 | bwd_allreduce: 764.43 | step: 1275.38 + 99%|█████████▉| 306/309 [1:29:44<00:51, 17.03s/it] {'loss': 0.1625, 'learning_rate': 9.934909589646157e-09, 'epoch': 2.95} + 99%|█████████▉| 306/309 [1:29:44<00:51, 17.03s/it]dynamic ViT batch size: 5, images per sample: 5.0, dynamic token length: 1403 +[2025-04-26 23:11:12,104] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 127.01 | bwd_microstep: 223.95 | bwd_inner_microstep: 223.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:11:12,590] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 310.13 | bwd_inner_microstep: 310.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:11:13,077] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 310.13 | bwd_inner_microstep: 310.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.04 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:13,566] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.99 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.66 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:11:14,054] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 310.22 | bwd_inner_microstep: 310.21 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:14,540] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 310.51 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:11:15,028] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 310.08 | bwd_inner_microstep: 310.07 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 23:11:15,159] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.86 | bwd_microstep: 86.46 | bwd_inner_microstep: 86.45 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:11:15,644] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1905 +[2025-04-26 23:11:16,131] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 310.72 | bwd_inner_microstep: 310.70 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:11:16,618] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 310.04 | bwd_inner_microstep: 310.02 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:11:17,107] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.11 | bwd_microstep: 310.18 | bwd_inner_microstep: 310.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1903 +[2025-04-26 23:11:17,595] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 310.13 | bwd_inner_microstep: 310.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 373 +[2025-04-26 23:11:17,726] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.06 | bwd_microstep: 86.43 | bwd_inner_microstep: 86.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:11:18,214] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 311.90 | bwd_inner_microstep: 311.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 878 +[2025-04-26 23:11:18,466] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.01 | bwd_microstep: 161.84 | bwd_inner_microstep: 161.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:11:18,954] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:11:19,441] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.76 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:11:19,930] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.84 | bwd_inner_microstep: 310.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:11:20,417] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 310.52 | bwd_inner_microstep: 310.50 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:11:20,905] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 310.91 | bwd_inner_microstep: 310.90 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1891 +[2025-04-26 23:11:21,391] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 308.88 | bwd_inner_microstep: 308.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1974 +[2025-04-26 23:11:21,903] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.77 | bwd_microstep: 327.42 | bwd_inner_microstep: 327.41 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1926 +[2025-04-26 23:11:22,409] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.40 | bwd_microstep: 323.01 | bwd_inner_microstep: 323.00 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:11:22,899] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 311.77 | bwd_inner_microstep: 311.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:11:23,389] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.19 | bwd_microstep: 311.82 | bwd_inner_microstep: 311.80 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:11:23,878] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 311.41 | bwd_inner_microstep: 311.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:11:24,366] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 311.03 | bwd_inner_microstep: 311.01 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:11:24,856] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.35 | bwd_microstep: 311.57 | bwd_inner_microstep: 311.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:11:25,346] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.37 | bwd_microstep: 311.39 | bwd_inner_microstep: 311.38 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:11:25,835] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 311.00 | bwd_inner_microstep: 310.99 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:11:29,025] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1201.37 | optimizer_gradients: 17.61 | optimizer_step: 32.03 +[2025-04-26 23:11:29,026] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.32 | bwd_microstep: 1743.41 | bwd_inner_microstep: 339.78 | bwd_allreduce_microstep: 1403.59 | step_microstep: 1269.59 +[2025-04-26 23:11:29,027] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5126.96 | bwd: 10720.08 | bwd_inner: 9316.03 | bwd_allreduce: 1403.71 | step: 1270.76 + 99%|█████████▉| 307/309 [1:30:01<00:34, 17.11s/it] {'loss': 0.2465, 'learning_rate': 4.415718488510745e-09, 'epoch': 2.96} + 99%|█████████▉| 307/309 [1:30:01<00:34, 17.11s/it]dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 23:11:29,141] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 37.74 | bwd_microstep: 70.21 | bwd_inner_microstep: 70.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 891 +[2025-04-26 23:11:29,394] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 84.94 | bwd_microstep: 163.17 | bwd_inner_microstep: 163.16 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1914 +[2025-04-26 23:11:29,880] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.08 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:11:30,369] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.28 | bwd_microstep: 310.63 | bwd_inner_microstep: 310.61 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:30,857] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:31,346] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 310.24 | bwd_inner_microstep: 310.23 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:31,834] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 310.33 | bwd_inner_microstep: 310.32 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 887 +[2025-04-26 23:11:32,087] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 86.35 | bwd_microstep: 162.80 | bwd_inner_microstep: 162.78 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 379 +[2025-04-26 23:11:32,218] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.18 | bwd_microstep: 87.04 | bwd_inner_microstep: 87.03 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:11:32,704] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:11:33,195] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 312.70 | bwd_inner_microstep: 312.68 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:11:33,682] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 310.44 | bwd_inner_microstep: 310.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:11:34,169] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 309.85 | bwd_inner_microstep: 309.83 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:11:34,658] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.98 | bwd_microstep: 310.32 | bwd_inner_microstep: 310.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1904 +[2025-04-26 23:11:35,144] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 309.29 | bwd_inner_microstep: 309.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:11:35,633] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.23 | bwd_microstep: 310.77 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:36,122] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 310.23 | bwd_inner_microstep: 310.22 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:36,610] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 310.83 | bwd_inner_microstep: 310.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:11:37,097] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 309.85 | bwd_inner_microstep: 309.84 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:11:37,585] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 310.15 | bwd_inner_microstep: 310.14 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.05 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:11:38,072] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 309.77 | bwd_inner_microstep: 309.75 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2130 +[2025-04-26 23:11:38,613] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.15 | bwd_microstep: 350.89 | bwd_inner_microstep: 350.88 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 23:11:39,121] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.34 | bwd_microstep: 323.53 | bwd_inner_microstep: 323.52 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 3, images per sample: 3.0, dynamic token length: 902 +[2025-04-26 23:11:39,389] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 91.50 | bwd_microstep: 171.21 | bwd_inner_microstep: 171.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:11:39,878] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 312.95 | bwd_inner_microstep: 312.93 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 382 +[2025-04-26 23:11:40,010] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.43 | bwd_microstep: 87.30 | bwd_inner_microstep: 87.28 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1920 +[2025-04-26 23:11:40,498] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 311.45 | bwd_inner_microstep: 311.44 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 381 +[2025-04-26 23:11:40,630] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.14 | bwd_microstep: 87.19 | bwd_inner_microstep: 87.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:11:41,118] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 312.66 | bwd_inner_microstep: 312.64 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1917 +[2025-04-26 23:11:41,608] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.73 | bwd_microstep: 311.60 | bwd_inner_microstep: 311.59 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:11:42,097] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 311.60 | bwd_inner_microstep: 311.58 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:11:45,735] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1203.80 | optimizer_gradients: 17.51 | optimizer_step: 32.03 +[2025-04-26 23:11:45,736] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.12 | bwd_microstep: 2189.26 | bwd_inner_microstep: 339.55 | bwd_allreduce_microstep: 1849.66 | step_microstep: 1271.52 +[2025-04-26 23:11:45,737] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 4745.08 | bwd: 10529.02 | bwd_inner: 8678.90 | bwd_allreduce: 1849.78 | step: 1272.76 + 100%|█████████▉| 308/309 [1:30:18<00:16, 16.99s/it] {'loss': 0.2177, 'learning_rate': 1.1039600903250958e-09, 'epoch': 2.97} + 100%|█████████▉| 308/309 [1:30:18<00:16, 16.99s/it]dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:11:46,208] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 293.82 | bwd_inner_microstep: 293.81 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:46,696] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 310.71 | bwd_inner_microstep: 310.69 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:11:47,184] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:11:47,671] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 310.45 | bwd_inner_microstep: 310.43 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:11:48,160] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 376 +[2025-04-26 23:11:48,291] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 39.72 | bwd_microstep: 86.84 | bwd_inner_microstep: 86.82 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:48,779] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 312.11 | bwd_inner_microstep: 312.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:49,267] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 311.14 | bwd_inner_microstep: 311.12 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:11:49,755] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 311.20 | bwd_inner_microstep: 311.18 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:11:50,243] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 310.10 | bwd_inner_microstep: 310.09 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:11:50,731] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1909 +[2025-04-26 23:11:51,219] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.18 | bwd_microstep: 310.41 | bwd_inner_microstep: 310.40 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:51,707] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 310.75 | bwd_inner_microstep: 310.74 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:52,195] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 310.58 | bwd_inner_microstep: 310.56 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:11:52,683] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 309.95 | bwd_inner_microstep: 309.94 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1910 +[2025-04-26 23:11:53,171] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.81 | bwd_microstep: 310.57 | bwd_inner_microstep: 310.55 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:11:53,659] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 310.64 | bwd_inner_microstep: 310.62 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:11:54,146] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 310.14 | bwd_inner_microstep: 310.13 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:11:54,634] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.85 | bwd_microstep: 310.27 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1902 +[2025-04-26 23:11:55,121] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 309.88 | bwd_inner_microstep: 309.87 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1893 +[2025-04-26 23:11:55,608] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 310.11 | bwd_inner_microstep: 310.10 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 2059 +[2025-04-26 23:11:56,135] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.16 | bwd_microstep: 338.66 | bwd_inner_microstep: 338.65 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1950 +[2025-04-26 23:11:56,644] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.53 | bwd_microstep: 324.99 | bwd_inner_microstep: 324.98 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1928 +[2025-04-26 23:11:57,150] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.77 | bwd_microstep: 323.21 | bwd_inner_microstep: 323.20 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1921 +[2025-04-26 23:11:57,656] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.15 | bwd_microstep: 322.93 | bwd_inner_microstep: 322.92 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:11:58,145] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.17 | bwd_microstep: 311.50 | bwd_inner_microstep: 311.48 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1912 +[2025-04-26 23:11:58,634] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.22 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.79 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1913 +[2025-04-26 23:11:59,122] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 311.44 | bwd_inner_microstep: 311.42 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.03 +dynamic ViT batch size: 1, images per sample: 1.0, dynamic token length: 380 +[2025-04-26 23:11:59,254] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 40.50 | bwd_microstep: 86.96 | bwd_inner_microstep: 86.95 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1911 +[2025-04-26 23:11:59,741] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 311.32 | bwd_inner_microstep: 311.30 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1915 +[2025-04-26 23:12:00,230] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 312.28 | bwd_inner_microstep: 312.27 | bwd_allreduce_microstep: 0.00 | step_microstep: 0.02 +dynamic ViT batch size: 7, images per sample: 7.0, dynamic token length: 1916 +[2025-04-26 23:12:03,221] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1202.12 | optimizer_gradients: 17.53 | optimizer_step: 32.03 +[2025-04-26 23:12:03,222] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.95 | bwd_microstep: 1544.70 | bwd_inner_microstep: 339.70 | bwd_allreduce_microstep: 1204.96 | step_microstep: 1269.89 +[2025-04-26 23:12:03,223] [INFO] [logging.py:128:log_dist] [Rank 0] time (ms) | fwd: 5278.07 | bwd: 10779.74 | bwd_inner: 9574.30 | bwd_allreduce: 1205.08 | step: 1270.95 + 100%|██████████| 309/309 [1:30:35<00:00, 17.14s/it] {'loss': 0.2132, 'learning_rate': 0.0, 'epoch': 2.98} + 100%|██████████| 309/309 [1:30:35<00:00, 17.14s/it]petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[INFO|trainer.py:1962] 2025-04-26 23:12:04,541 >> + +Training completed. Do not forget to share your model on huggingface.co/models =) + + + {'train_runtime': 5438.4326, 'train_samples_per_second': 5.498, 'train_steps_per_second': 0.057, 'train_loss': 0.5500956740242378, 'epoch': 2.98} + 100%|██████████| 309/309 [1:30:37<00:00, 17.14s/it] 100%|██████████| 309/309 [1:30:37<00:00, 17.60s/it] +[INFO|trainer.py:2936] 2025-04-26 23:12:06,435 >> Saving model checkpoint to work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full +[INFO|configuration_utils.py:473] 2025-04-26 23:12:06,436 >> Configuration saved in work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/config.json +[INFO|configuration_utils.py:594] 2025-04-26 23:12:06,436 >> Configuration saved in work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/generation_config.json +[INFO|modeling_utils.py:2493] 2025-04-26 23:12:11,918 >> Model weights saved in work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/model.safetensors +[INFO|tokenization_utils_base.py:2433] 2025-04-26 23:12:11,920 >> tokenizer config file saved in work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/tokenizer_config.json +[INFO|tokenization_utils_base.py:2442] 2025-04-26 23:12:11,920 >> Special tokens file saved in work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/special_tokens_map.json +[INFO|tokenization_utils_base.py:2493] 2025-04-26 23:12:11,920 >> added tokens file saved in work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/added_tokens.json +***** train metrics ***** + epoch = 2.98 + train_loss = 0.5501 + train_runtime = 1:30:38.43 + train_samples = 9967 + train_samples_per_second = 5.498 + train_steps_per_second = 0.057 +wandb: +wandb: 🚀 View run divine-snowball-133 at: https://wandb.ai/dyang39/huggingface/runs/j09wcelk +wandb: Find logs at: wandb/run-20250426_214126-j09wcelk/logs +[rank0]:[W426 23:12:14.623626820 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +W0426 23:23:11.970237 1883855 site-packages/torch/distributed/run.py:766] +W0426 23:23:11.970237 1883855 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 23:23:11.970237 1883855 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0426 23:23:11.970237 1883855 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 23:44:32.046552 1897763 site-packages/torch/distributed/run.py:766] +W0426 23:44:32.046552 1897763 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 23:44:32.046552 1897763 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0426 23:44:32.046552 1897763 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-26 23:44:33,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 23:44:33,300] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 23:44:33,307] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 23:44:36,323] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 23:44:36,323] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[rank0]: Traceback (most recent call last): +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank0]: main() +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 815, in main +[rank0]: model_args, data_args, training_args = parser.parse_args_into_dataclasses() +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses +[rank0]: obj = dtype(**inputs) +[rank0]: File "", line 121, in __init__ +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/training_args.py", line 1397, in __post_init__ +[rank0]: raise ValueError( +[rank0]: ValueError: --load_best_model_at_end requires the saving steps to be a round multiple of the evaluation steps, but found 25, which is not a round multiple of 10. +[2025-04-26 23:44:36,454] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 23:44:36,454] [INFO] [comm.py:652:init_distributed] cdb=None +[rank2]: Traceback (most recent call last): +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank2]: main() +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 815, in main +[rank2]: model_args, data_args, training_args = parser.parse_args_into_dataclasses() +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses +[rank2]: obj = dtype(**inputs) +[rank2]: File "", line 121, in __init__ +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/training_args.py", line 1397, in __post_init__ +[rank2]: raise ValueError( +[rank2]: ValueError: --load_best_model_at_end requires the saving steps to be a round multiple of the evaluation steps, but found 25, which is not a round multiple of 10. +[rank1]: Traceback (most recent call last): +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank1]: main() +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 815, in main +[rank1]: model_args, data_args, training_args = parser.parse_args_into_dataclasses() +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses +[rank1]: obj = dtype(**inputs) +[rank1]: File "", line 121, in __init__ +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/training_args.py", line 1397, in __post_init__ +[rank1]: raise ValueError( +[rank1]: ValueError: --load_best_model_at_end requires the saving steps to be a round multiple of the evaluation steps, but found 25, which is not a round multiple of 10. +[rank0]:[W426 23:44:36.321293611 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +W0426 23:44:37.290389 1897763 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1897825 closing signal SIGTERM +W0426 23:44:37.290837 1897763 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1897826 closing signal SIGTERM +E0426 23:44:37.705573 1897763 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 1897824) of binary: /data/diji/.conda/envs/internvl/bin/python +Traceback (most recent call last): + File "/data/diji/.conda/envs/internvl/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==2.7.0', 'console_scripts', 'torchrun')()) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +internvl_chat/internvl/train/internvl_chat_finetune.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-04-26_23:44:37 + host : nlp-in-477-l.soe.ucsc.edu + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 1897824) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +W0426 23:46:39.558783 1899170 site-packages/torch/distributed/run.py:766] +W0426 23:46:39.558783 1899170 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 23:46:39.558783 1899170 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0426 23:46:39.558783 1899170 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-26 23:46:40,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 23:46:40,831] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 23:46:40,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 23:46:43,819] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 23:46:43,819] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +04/26/2025 23:46:43 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 23:46:43 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=internvl_chat/zero_stage1_config.json, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=10, +evaluation_strategy=steps, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=32, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=False, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=4e-05, +length_column_name=length, +load_best_model_at_end=True, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/runs/Apr26_23-46-43_nlp-in-477-l.soe.ucsc.edu, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_kwargs={}, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=eval_loss, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=1.0, +optim=adamw_torch, +optim_args=None, +output_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=10, +save_strategy=steps, +save_total_limit=2, +seed=42, +skip_memory_metrics=True, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.01, +) +04/26/2025 23:46:43 - INFO - __main__ - Loading Tokenizer: pretrained/InternVL2_5-2B +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:46:43,904 >> loading file ./tokenizer.model +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:46:43,904 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:46:43,904 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:46:43,904 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:46:43,904 >> loading file tokenizer.json +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 23:46:43,953] [INFO] [comm.py:652:init_distributed] cdb=None +[WARNING|logging.py:314] 2025-04-26 23:46:44,012 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[2025-04-26 23:46:44,054] [INFO] [comm.py:652:init_distributed] cdb=None +04/26/2025 23:46:44 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 23:46:44 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:727] 2025-04-26 23:46:44,098 >> loading configuration file pretrained/InternVL2_5-2B/config.json +[INFO|configuration_utils.py:792] 2025-04-26 23:46:44,099 >> Model config InternVLChatConfig { + "_commit_hash": null, + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 2048, + "llm_config": { + "_name_or_path": "internlm/internlm2_5-1_8b-chat", + "add_cross_attention": false, + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "flash_attention_2", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForSequenceClassification": "modeling_internlm2.InternLM2ForSequenceClassification" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bias": false, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "min_length": 0, + "model_type": "internlm2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 2, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": true, + "vocab_size": 92553 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "template": "internvl2_5", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.0, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "no_repeat_ngram_size": 0, + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true + } +} + +04/26/2025 23:46:44 - INFO - __main__ - Using flash_attention_2 for InternLM +[INFO|modeling_utils.py:3473] 2025-04-26 23:46:44,100 >> loading weights file pretrained/InternVL2_5-2B/model.safetensors +[INFO|modeling_utils.py:1426] 2025-04-26 23:46:44,111 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:826] 2025-04-26 23:46:44,112 >> Generate config GenerationConfig {} + +04/26/2025 23:46:44 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False +[INFO|configuration_utils.py:826] 2025-04-26 23:46:44,140 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +[WARNING|logging.py:314] 2025-04-26 23:46:44,181 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2025-04-26 23:46:44,245 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|modeling_utils.py:4350] 2025-04-26 23:46:46,893 >> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4358] 2025-04-26 23:46:46,893 >> All the weights of InternVLChatModel were initialized from the model checkpoint at pretrained/InternVL2_5-2B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:779] 2025-04-26 23:46:46,896 >> loading configuration file pretrained/InternVL2_5-2B/generation_config.json +[INFO|configuration_utils.py:826] 2025-04-26 23:46:46,896 >> Generate config GenerationConfig { + "eos_token_id": [ + 92542, + 92543 + ] +} + +04/26/2025 23:46:47 - INFO - __main__ - Finished +04/26/2025 23:46:47 - INFO - __main__ - model.config.force_image_size: 448 +04/26/2025 23:46:47 - INFO - __main__ - data_args.force_image_size: 448 +04/26/2025 23:46:47 - INFO - __main__ - model.config.vision_config.image_size: 448 +04/26/2025 23:46:47 - INFO - __main__ - [Dataset] num_image_token: 256 +04/26/2025 23:46:47 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/26/2025 23:46:47 - INFO - __main__ - [Dataset] use_thumbnail: True +04/26/2025 23:46:47 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/26/2025 23:46:47 - INFO - __main__ - Formatting inputs...Skip in lazy mode +04/26/2025 23:46:47 - INFO - __main__ - Add dataset: bbox with length: 9967 +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.tok_embeddings.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.0.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.0.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.0.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.0.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.0.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.0.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.0.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.1.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.1.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.1.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.1.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.1.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.1.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.1.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.2.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.2.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.2.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.2.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.2.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.2.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.2.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.3.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.3.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.3.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.3.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.3.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.3.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.3.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.4.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.4.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.4.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.4.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.4.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.4.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.4.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.5.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.5.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.5.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.5.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.5.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.5.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.5.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.6.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.6.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.6.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.6.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.6.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.6.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.6.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.7.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.7.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.7.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.7.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.7.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.7.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.7.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.8.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.8.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.8.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.8.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.8.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.8.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.8.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.9.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.9.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.9.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.9.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.9.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.9.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.9.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.10.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.10.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.10.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.10.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.10.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.10.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.10.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.11.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.11.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.11.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.11.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.11.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.11.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.11.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.12.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.12.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.12.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.12.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.12.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.12.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.12.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.13.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.13.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.13.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.13.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.13.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.13.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.13.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.14.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.14.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.14.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.14.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.14.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.14.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.14.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.15.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.15.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.15.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.15.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.15.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.15.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.15.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.16.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.16.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.16.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.16.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.16.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.16.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.16.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.17.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.17.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.17.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.17.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.17.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.17.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.17.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.18.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.18.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.18.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.18.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.18.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.18.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.18.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.19.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.19.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.19.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.19.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.19.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.19.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.19.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.20.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.20.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.20.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.20.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.20.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.20.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.20.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.21.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.21.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.21.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.21.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.21.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.21.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.21.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.22.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.22.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.22.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.22.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.22.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.22.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.22.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.23.attention.wqkv.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.23.attention.wo.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.23.feed_forward.w1.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.23.feed_forward.w3.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.23.feed_forward.w2.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.23.attention_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.layers.23.ffn_norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.model.norm.weight +04/26/2025 23:46:47 - INFO - __main__ - language_model.output.weight +04/26/2025 23:46:47 - INFO - __main__ - mlp1.0.weight +04/26/2025 23:46:47 - INFO - __main__ - mlp1.0.bias +04/26/2025 23:46:47 - INFO - __main__ - mlp1.1.weight +04/26/2025 23:46:47 - INFO - __main__ - mlp1.1.bias +04/26/2025 23:46:47 - INFO - __main__ - mlp1.3.weight +04/26/2025 23:46:47 - INFO - __main__ - mlp1.3.bias +[INFO|trainer.py:571] 2025-04-26 23:46:47,160 >> Using auto half precision backend +[2025-04-26 23:46:47,348] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.15.4, git-hash=unknown, git-branch=unknown +[2025-04-26 23:46:47,348] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 3 +[2025-04-26 23:46:48,227] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file /data/diji/.cache/torch_extensions/py39_cu126/fused_adam/build.ninja... +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.3347017765045166 seconds +Loading extension module fused_adam... +Time to load fused_adam op: 0.4022197723388672 seconds +Loading extension module fused_adam... +Time to load fused_adam op: 0.4013853073120117 seconds +[2025-04-26 23:46:49,093] [INFO] [logging.py:128:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer +[2025-04-26 23:46:49,093] [INFO] [logging.py:128:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2025-04-26 23:46:49,099] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +[2025-04-26 23:46:49,099] [INFO] [utils.py:59:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[2025-04-26 23:46:49,099] [INFO] [logging.py:128:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 1 optimizer +[2025-04-26 23:46:49,099] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 1000000000 +[2025-04-26 23:46:49,099] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 1000000000 +[2025-04-26 23:46:49,099] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False +[2025-04-26 23:46:49,099] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False +[2025-04-26 23:46:53,123] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states +[2025-04-26 23:46:53,123] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 8.03 GB CA 8.4 GB Max_CA 8 GB +[2025-04-26 23:46:53,123] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.86 GB, percent = 13.9% +[2025-04-26 23:46:53,276] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states +[2025-04-26 23:46:53,277] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 9.21 GB CA 10.76 GB Max_CA 11 GB +[2025-04-26 23:46:53,277] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.86 GB, percent = 13.9% +[2025-04-26 23:46:53,277] [INFO] [stage_1_and_2.py:544:__init__] optimizer state initialized +[2025-04-26 23:46:53,425] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer +[2025-04-26 23:46:53,425] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 6.85 GB CA 10.76 GB Max_CA 11 GB +[2025-04-26 23:46:53,426] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.86 GB, percent = 13.9% +[2025-04-26 23:46:53,427] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer +[2025-04-26 23:46:53,427] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler +[2025-04-26 23:46:53,427] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2025-04-26 23:46:53,427] [INFO] [logging.py:128:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[[0.9, 0.999]] +[2025-04-26 23:46:53,428] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2025-04-26 23:46:53,428] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-04-26 23:46:53,428] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-04-26 23:46:53,428] [INFO] [config.py:1003:print] amp_enabled .................. False +[2025-04-26 23:46:53,428] [INFO] [config.py:1003:print] amp_params ................... False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] bfloat16_enabled ............. True +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] bfloat16_immediate_grad_update False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] comms_config ................. +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] communication_data_type ...... None +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] disable_allgather ............ False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] dump_state ................... False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... None +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] fp16_auto_cast ............... None +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] fp16_enabled ................. False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] global_rank .................. 0 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 32 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 1 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] loss_scale ................... 1.0 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] mics_shard_size .............. -1 +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] optimizer_name ............... adamw +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] optimizer_params ............. {'lr': 4e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.01} +[2025-04-26 23:46:53,429] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] pld_enabled .................. False +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] pld_params ................... False +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] prescale_gradients ........... False +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] scheduler_name ............... None +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] scheduler_params ............. None +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] seq_parallel_communication_data_type torch.float32 +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] sparse_attention ............. None +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] steps_per_print .............. inf +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] train_batch_size ............. 96 +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 1 +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] wall_clock_breakdown ......... True +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] weight_quantization_config ... None +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] world_size ................... 3 +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=1000000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=1000000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] zero_enabled ................. True +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2025-04-26 23:46:53,430] [INFO] [config.py:1003:print] zero_optimization_stage ...... 1 +[2025-04-26 23:46:53,430] [INFO] [config.py:989:print_user_config] json = { + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1.000000e+09, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1.000000e+09, + "contiguous_gradients": true + }, + "fp16": { + "enabled": false, + "auto_cast": true, + "loss_scale": 0, + "initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 4e-05, + "betas": [0.9, 0.999], + "eps": 1e-08, + "weight_decay": 0.01 + } + }, + "gradient_accumulation_steps": 32, + "gradient_clipping": 1.0, + "steps_per_print": inf, + "train_batch_size": 96, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": true +} +[INFO|trainer.py:1721] 2025-04-26 23:46:53,430 >> ***** Running training ***** +[INFO|trainer.py:1722] 2025-04-26 23:46:53,430 >> Num examples = 9,967 +[INFO|trainer.py:1723] 2025-04-26 23:46:53,430 >> Num Epochs = 1 +[INFO|trainer.py:1724] 2025-04-26 23:46:53,430 >> Instantaneous batch size per device = 1 +[INFO|trainer.py:1727] 2025-04-26 23:46:53,430 >> Total train batch size (w. parallel, distributed & accumulation) = 96 +[INFO|trainer.py:1728] 2025-04-26 23:46:53,430 >> Gradient Accumulation steps = 32 +[INFO|trainer.py:1729] 2025-04-26 23:46:53,430 >> Total optimization steps = 103 +[INFO|trainer.py:1730] 2025-04-26 23:46:53,431 >> Number of trainable parameters = 1,901,742,080 + 0%| | 0/103 [00:00 +[rank0]: main() +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1057, in main +[rank0]: train_result = trainer.train(resume_from_checkpoint=checkpoint) +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1539, in train +[rank0]: return inner_training_loop( +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1929, in _inner_training_loop +[rank0]: self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 2291, in _maybe_log_save_evaluate +[rank0]: metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 3091, in evaluate +[rank0]: eval_dataloader = self.get_eval_dataloader(eval_dataset) +[rank0]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 846, in get_eval_dataloader +[rank0]: raise ValueError("Trainer: evaluation requires an eval_dataset.") +[rank0]: ValueError: Trainer: evaluation requires an eval_dataset. +[rank1]: Traceback (most recent call last): +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank1]: main() +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1057, in main +[rank1]: train_result = trainer.train(resume_from_checkpoint=checkpoint) +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1539, in train +[rank1]: return inner_training_loop( +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1929, in _inner_training_loop +[rank1]: self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 2291, in _maybe_log_save_evaluate +[rank1]: metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 3091, in evaluate +[rank1]: eval_dataloader = self.get_eval_dataloader(eval_dataset) +[rank1]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 846, in get_eval_dataloader +[rank1]: raise ValueError("Trainer: evaluation requires an eval_dataset.") +[rank1]: ValueError: Trainer: evaluation requires an eval_dataset. +[rank2]: Traceback (most recent call last): +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1072, in +[rank2]: main() +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1057, in main +[rank2]: train_result = trainer.train(resume_from_checkpoint=checkpoint) +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1539, in train +[rank2]: return inner_training_loop( +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 1929, in _inner_training_loop +[rank2]: self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 2291, in _maybe_log_save_evaluate +[rank2]: metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 3091, in evaluate +[rank2]: eval_dataloader = self.get_eval_dataloader(eval_dataset) +[rank2]: File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/transformers/trainer.py", line 846, in get_eval_dataloader +[rank2]: raise ValueError("Trainer: evaluation requires an eval_dataset.") +[rank2]: ValueError: Trainer: evaluation requires an eval_dataset. + 10%|▉ | 10/103 [03:09<29:18, 18.91s/it] +[rank0]:[W426 23:50:02.051559216 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +W0426 23:50:03.791993 1899170 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1899220 closing signal SIGTERM +W0426 23:50:03.792447 1899170 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1899222 closing signal SIGTERM +E0426 23:50:04.120112 1899170 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 1 (pid: 1899221) of binary: /data/diji/.conda/envs/internvl/bin/python +Traceback (most recent call last): + File "/data/diji/.conda/envs/internvl/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==2.7.0', 'console_scripts', 'torchrun')()) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +internvl_chat/internvl/train/internvl_chat_finetune.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-04-26_23:50:03 + host : nlp-in-477-l.soe.ucsc.edu + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 1899221) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +W0426 23:54:17.921472 1907587 site-packages/torch/distributed/run.py:766] +W0426 23:54:17.921472 1907587 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 23:54:17.921472 1907587 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0426 23:54:17.921472 1907587 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-26 23:54:19,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 23:54:19,198] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 23:54:19,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 23:54:22,165] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 23:54:22,165] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +04/26/2025 23:54:22 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 23:54:22 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=internvl_chat/zero_stage1_config.json, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=5, +evaluation_strategy=steps, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=32, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=False, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=4e-05, +length_column_name=length, +load_best_model_at_end=True, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/runs/Apr26_23-54-22_nlp-in-477-l.soe.ucsc.edu, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_kwargs={}, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=eval_loss, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=1.0, +optim=adamw_torch, +optim_args=None, +output_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=5, +save_strategy=steps, +save_total_limit=2, +seed=42, +skip_memory_metrics=True, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.01, +) +04/26/2025 23:54:22 - INFO - __main__ - Loading Tokenizer: pretrained/InternVL2_5-2B +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:54:22,293 >> loading file ./tokenizer.model +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:54:22,293 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:54:22,293 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:54:22,293 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:54:22,294 >> loading file tokenizer.json +[2025-04-26 23:54:22,313] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 23:54:22,314] [INFO] [comm.py:652:init_distributed] cdb=None +04/26/2025 23:54:22 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False +[WARNING|logging.py:314] 2025-04-26 23:54:22,402 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +04/26/2025 23:54:22 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 23:54:22 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:727] 2025-04-26 23:54:22,487 >> loading configuration file pretrained/InternVL2_5-2B/config.json +[INFO|configuration_utils.py:792] 2025-04-26 23:54:22,488 >> Model config InternVLChatConfig { + "_commit_hash": null, + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 2048, + "llm_config": { + "_name_or_path": "internlm/internlm2_5-1_8b-chat", + "add_cross_attention": false, + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "flash_attention_2", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForSequenceClassification": "modeling_internlm2.InternLM2ForSequenceClassification" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bias": false, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "min_length": 0, + "model_type": "internlm2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 2, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": true, + "vocab_size": 92553 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "template": "internvl2_5", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.0, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "no_repeat_ngram_size": 0, + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true + } +} + +04/26/2025 23:54:22 - INFO - __main__ - Using flash_attention_2 for InternLM +[INFO|modeling_utils.py:3473] 2025-04-26 23:54:22,489 >> loading weights file pretrained/InternVL2_5-2B/model.safetensors +[INFO|modeling_utils.py:1426] 2025-04-26 23:54:22,499 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:826] 2025-04-26 23:54:22,500 >> Generate config GenerationConfig {} + +[WARNING|logging.py:314] 2025-04-26 23:54:22,504 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2025-04-26 23:54:22,525 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|configuration_utils.py:826] 2025-04-26 23:54:22,528 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +[INFO|modeling_utils.py:4350] 2025-04-26 23:54:25,165 >> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4358] 2025-04-26 23:54:25,165 >> All the weights of InternVLChatModel were initialized from the model checkpoint at pretrained/InternVL2_5-2B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:779] 2025-04-26 23:54:25,168 >> loading configuration file pretrained/InternVL2_5-2B/generation_config.json +[INFO|configuration_utils.py:826] 2025-04-26 23:54:25,168 >> Generate config GenerationConfig { + "eos_token_id": [ + 92542, + 92543 + ] +} + +04/26/2025 23:54:25 - INFO - __main__ - Finished +04/26/2025 23:54:25 - INFO - __main__ - model.config.force_image_size: 448 +04/26/2025 23:54:25 - INFO - __main__ - data_args.force_image_size: 448 +04/26/2025 23:54:25 - INFO - __main__ - model.config.vision_config.image_size: 448 +04/26/2025 23:54:25 - INFO - __main__ - [Dataset] num_image_token: 256 +04/26/2025 23:54:25 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/26/2025 23:54:25 - INFO - __main__ - [Dataset] use_thumbnail: True +04/26/2025 23:54:25 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/26/2025 23:54:25 - INFO - __main__ - Formatting inputs...Skip in lazy mode +04/26/2025 23:54:25 - INFO - __main__ - Add dataset: bbox with length: 9967 +04/26/2025 23:54:25 - INFO - __main__ - [Dataset] num_image_token: 256 +04/26/2025 23:54:25 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/26/2025 23:54:25 - INFO - __main__ - [Dataset] use_thumbnail: True +04/26/2025 23:54:25 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/26/2025 23:54:25 - INFO - __main__ - Formatting inputs...Skip in lazy mode +[rank0]: Traceback (most recent call last): +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1110, in +[rank0]: main() +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 996, in main +[rank0]: eval_dataset = LazySupervisedDataset( +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank0]: self.rng = np.random.default_rng(seed=random_seed) +[rank0]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank0]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank0]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank0]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank0]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank2]: Traceback (most recent call last): +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1110, in +[rank2]: main() +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 996, in main +[rank2]: eval_dataset = LazySupervisedDataset( +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank2]: self.rng = np.random.default_rng(seed=random_seed) +[rank2]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank2]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank2]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank2]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank2]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank1]: Traceback (most recent call last): +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1110, in +[rank1]: main() +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 996, in main +[rank1]: eval_dataset = LazySupervisedDataset( +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank1]: self.rng = np.random.default_rng(seed=random_seed) +[rank1]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank1]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank1]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank1]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank1]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank0]:[W426 23:54:26.204814075 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +W0426 23:54:27.281659 1907587 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1907653 closing signal SIGTERM +W0426 23:54:27.282115 1907587 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1907654 closing signal SIGTERM +E0426 23:54:27.546573 1907587 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 1907652) of binary: /data/diji/.conda/envs/internvl/bin/python +Traceback (most recent call last): + File "/data/diji/.conda/envs/internvl/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==2.7.0', 'console_scripts', 'torchrun')()) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +internvl_chat/internvl/train/internvl_chat_finetune.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-04-26_23:54:27 + host : nlp-in-477-l.soe.ucsc.edu + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 1907652) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +W0426 23:54:31.879660 1908010 site-packages/torch/distributed/run.py:766] +W0426 23:54:31.879660 1908010 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 23:54:31.879660 1908010 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0426 23:54:31.879660 1908010 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-26 23:54:33,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 23:54:33,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 23:54:33,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 23:54:36,152] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 23:54:36,152] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +04/26/2025 23:54:36 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 23:54:36 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=internvl_chat/zero_stage1_config.json, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=10, +evaluation_strategy=steps, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=32, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=False, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=4e-05, +length_column_name=length, +load_best_model_at_end=True, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/runs/Apr26_23-54-36_nlp-in-477-l.soe.ucsc.edu, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_kwargs={}, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=eval_loss, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=1.0, +optim=adamw_torch, +optim_args=None, +output_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=10, +save_strategy=steps, +save_total_limit=2, +seed=42, +skip_memory_metrics=True, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.01, +) +04/26/2025 23:54:36 - INFO - __main__ - Loading Tokenizer: pretrained/InternVL2_5-2B +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:54:36,285 >> loading file ./tokenizer.model +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:54:36,285 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:54:36,285 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:54:36,286 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:54:36,286 >> loading file tokenizer.json +[2025-04-26 23:54:36,329] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 23:54:36,329] [INFO] [comm.py:652:init_distributed] cdb=None +[WARNING|logging.py:314] 2025-04-26 23:54:36,397 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +04/26/2025 23:54:36 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 23:54:36 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 23:54:36 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:727] 2025-04-26 23:54:36,485 >> loading configuration file pretrained/InternVL2_5-2B/config.json +[INFO|configuration_utils.py:792] 2025-04-26 23:54:36,486 >> Model config InternVLChatConfig { + "_commit_hash": null, + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 2048, + "llm_config": { + "_name_or_path": "internlm/internlm2_5-1_8b-chat", + "add_cross_attention": false, + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "flash_attention_2", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForSequenceClassification": "modeling_internlm2.InternLM2ForSequenceClassification" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bias": false, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "min_length": 0, + "model_type": "internlm2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 2, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": true, + "vocab_size": 92553 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "template": "internvl2_5", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.0, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "no_repeat_ngram_size": 0, + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true + } +} + +04/26/2025 23:54:36 - INFO - __main__ - Using flash_attention_2 for InternLM +[INFO|modeling_utils.py:3473] 2025-04-26 23:54:36,487 >> loading weights file pretrained/InternVL2_5-2B/model.safetensors +[INFO|modeling_utils.py:1426] 2025-04-26 23:54:36,497 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:826] 2025-04-26 23:54:36,498 >> Generate config GenerationConfig {} + +[WARNING|logging.py:314] 2025-04-26 23:54:36,522 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|configuration_utils.py:826] 2025-04-26 23:54:36,526 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +[WARNING|logging.py:314] 2025-04-26 23:54:36,560 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|modeling_utils.py:4350] 2025-04-26 23:54:39,141 >> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4358] 2025-04-26 23:54:39,141 >> All the weights of InternVLChatModel were initialized from the model checkpoint at pretrained/InternVL2_5-2B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:779] 2025-04-26 23:54:39,143 >> loading configuration file pretrained/InternVL2_5-2B/generation_config.json +[INFO|configuration_utils.py:826] 2025-04-26 23:54:39,144 >> Generate config GenerationConfig { + "eos_token_id": [ + 92542, + 92543 + ] +} + +04/26/2025 23:54:39 - INFO - __main__ - Finished +04/26/2025 23:54:39 - INFO - __main__ - model.config.force_image_size: 448 +04/26/2025 23:54:39 - INFO - __main__ - data_args.force_image_size: 448 +04/26/2025 23:54:39 - INFO - __main__ - model.config.vision_config.image_size: 448 +04/26/2025 23:54:39 - INFO - __main__ - [Dataset] num_image_token: 256 +04/26/2025 23:54:39 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/26/2025 23:54:39 - INFO - __main__ - [Dataset] use_thumbnail: True +04/26/2025 23:54:39 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/26/2025 23:54:39 - INFO - __main__ - Formatting inputs...Skip in lazy mode +04/26/2025 23:54:39 - INFO - __main__ - Add dataset: bbox with length: 9967 +04/26/2025 23:54:39 - INFO - __main__ - [Dataset] num_image_token: 256 +04/26/2025 23:54:39 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/26/2025 23:54:39 - INFO - __main__ - [Dataset] use_thumbnail: True +04/26/2025 23:54:39 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/26/2025 23:54:39 - INFO - __main__ - Formatting inputs...Skip in lazy mode +[rank0]: Traceback (most recent call last): +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1110, in +[rank0]: main() +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 996, in main +[rank0]: eval_dataset = LazySupervisedDataset( +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank0]: self.rng = np.random.default_rng(seed=random_seed) +[rank0]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank0]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank0]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank0]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank0]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank1]: Traceback (most recent call last): +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1110, in +[rank1]: main() +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 996, in main +[rank1]: eval_dataset = LazySupervisedDataset( +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank1]: self.rng = np.random.default_rng(seed=random_seed) +[rank1]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank1]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank1]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank1]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank1]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank2]: Traceback (most recent call last): +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1110, in +[rank2]: main() +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 996, in main +[rank2]: eval_dataset = LazySupervisedDataset( +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank2]: self.rng = np.random.default_rng(seed=random_seed) +[rank2]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank2]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank2]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank2]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank2]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank0]:[W426 23:54:40.160111618 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +W0426 23:54:41.235127 1908010 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1908072 closing signal SIGTERM +W0426 23:54:41.235920 1908010 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1908073 closing signal SIGTERM +E0426 23:54:41.400376 1908010 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 1908071) of binary: /data/diji/.conda/envs/internvl/bin/python +Traceback (most recent call last): + File "/data/diji/.conda/envs/internvl/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==2.7.0', 'console_scripts', 'torchrun')()) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +internvl_chat/internvl/train/internvl_chat_finetune.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-04-26_23:54:41 + host : nlp-in-477-l.soe.ucsc.edu + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 1908071) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +W0426 23:55:19.644485 1908813 site-packages/torch/distributed/run.py:766] +W0426 23:55:19.644485 1908813 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 23:55:19.644485 1908813 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0426 23:55:19.644485 1908813 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-26 23:55:20,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 23:55:20,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 23:55:20,909] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 23:55:23,848] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 23:55:23,848] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +04/26/2025 23:55:23 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 23:55:23 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=internvl_chat/zero_stage1_config.json, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=10, +evaluation_strategy=steps, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=32, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=False, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=4e-05, +length_column_name=length, +load_best_model_at_end=True, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/runs/Apr26_23-55-23_nlp-in-477-l.soe.ucsc.edu, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_kwargs={}, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=eval_loss, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=1.0, +optim=adamw_torch, +optim_args=None, +output_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=10, +save_strategy=steps, +save_total_limit=2, +seed=42, +skip_memory_metrics=True, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.01, +) +04/26/2025 23:55:23 - INFO - __main__ - Loading Tokenizer: pretrained/InternVL2_5-2B +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:55:23,983 >> loading file ./tokenizer.model +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:55:23,983 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:55:23,983 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:55:23,983 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:55:23,983 >> loading file tokenizer.json +[2025-04-26 23:55:24,022] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 23:55:24,023] [INFO] [comm.py:652:init_distributed] cdb=None +[WARNING|logging.py:314] 2025-04-26 23:55:24,093 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +04/26/2025 23:55:24 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 23:55:24 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 23:55:24 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:727] 2025-04-26 23:55:24,181 >> loading configuration file pretrained/InternVL2_5-2B/config.json +[INFO|configuration_utils.py:792] 2025-04-26 23:55:24,182 >> Model config InternVLChatConfig { + "_commit_hash": null, + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 2048, + "llm_config": { + "_name_or_path": "internlm/internlm2_5-1_8b-chat", + "add_cross_attention": false, + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "flash_attention_2", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForSequenceClassification": "modeling_internlm2.InternLM2ForSequenceClassification" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bias": false, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "min_length": 0, + "model_type": "internlm2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 2, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": true, + "vocab_size": 92553 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "template": "internvl2_5", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.0, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "no_repeat_ngram_size": 0, + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true + } +} + +04/26/2025 23:55:24 - INFO - __main__ - Using flash_attention_2 for InternLM +[INFO|modeling_utils.py:3473] 2025-04-26 23:55:24,183 >> loading weights file pretrained/InternVL2_5-2B/model.safetensors +[INFO|modeling_utils.py:1426] 2025-04-26 23:55:24,194 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:826] 2025-04-26 23:55:24,195 >> Generate config GenerationConfig {} + +[WARNING|logging.py:314] 2025-04-26 23:55:24,219 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|configuration_utils.py:826] 2025-04-26 23:55:24,223 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +[WARNING|logging.py:314] 2025-04-26 23:55:24,235 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|modeling_utils.py:4350] 2025-04-26 23:55:27,025 >> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4358] 2025-04-26 23:55:27,025 >> All the weights of InternVLChatModel were initialized from the model checkpoint at pretrained/InternVL2_5-2B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:779] 2025-04-26 23:55:27,028 >> loading configuration file pretrained/InternVL2_5-2B/generation_config.json +[INFO|configuration_utils.py:826] 2025-04-26 23:55:27,028 >> Generate config GenerationConfig { + "eos_token_id": [ + 92542, + 92543 + ] +} + +04/26/2025 23:55:27 - INFO - __main__ - Finished +04/26/2025 23:55:27 - INFO - __main__ - model.config.force_image_size: 448 +04/26/2025 23:55:27 - INFO - __main__ - data_args.force_image_size: 448 +04/26/2025 23:55:27 - INFO - __main__ - model.config.vision_config.image_size: 448 +04/26/2025 23:55:27 - INFO - __main__ - [Dataset] num_image_token: 256 +04/26/2025 23:55:27 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/26/2025 23:55:27 - INFO - __main__ - [Dataset] use_thumbnail: True +04/26/2025 23:55:27 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/26/2025 23:55:27 - INFO - __main__ - Formatting inputs...Skip in lazy mode +[rank1]: Traceback (most recent call last): +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1110, in +[rank1]: main() +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 996, in main +[rank1]: eval_dataset = LazySupervisedDataset( +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank1]: self.rng = np.random.default_rng(seed=random_seed) +[rank1]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank1]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank1]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank1]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank1]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +04/26/2025 23:55:27 - INFO - __main__ - Add dataset: bbox with length: 9967 +04/26/2025 23:55:27 - INFO - __main__ - [Dataset] num_image_token: 256 +04/26/2025 23:55:27 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/26/2025 23:55:27 - INFO - __main__ - [Dataset] use_thumbnail: True +04/26/2025 23:55:27 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/26/2025 23:55:27 - INFO - __main__ - Formatting inputs...Skip in lazy mode +[rank0]: Traceback (most recent call last): +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1110, in +[rank0]: main() +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 996, in main +[rank0]: eval_dataset = LazySupervisedDataset( +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank0]: self.rng = np.random.default_rng(seed=random_seed) +[rank0]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank0]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank0]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank0]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank0]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank2]: Traceback (most recent call last): +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1110, in +[rank2]: main() +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 996, in main +[rank2]: eval_dataset = LazySupervisedDataset( +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank2]: self.rng = np.random.default_rng(seed=random_seed) +[rank2]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank2]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank2]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank2]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank2]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank0]:[W426 23:55:28.237132869 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +W0426 23:55:29.199494 1908813 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1908868 closing signal SIGTERM +W0426 23:55:29.200260 1908813 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1908870 closing signal SIGTERM +E0426 23:55:29.746930 1908813 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 1 (pid: 1908869) of binary: /data/diji/.conda/envs/internvl/bin/python +Traceback (most recent call last): + File "/data/diji/.conda/envs/internvl/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==2.7.0', 'console_scripts', 'torchrun')()) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +internvl_chat/internvl/train/internvl_chat_finetune.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-04-26_23:55:29 + host : nlp-in-477-l.soe.ucsc.edu + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 1908869) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +W0426 23:58:21.778661 1910742 site-packages/torch/distributed/run.py:766] +W0426 23:58:21.778661 1910742 site-packages/torch/distributed/run.py:766] ***************************************** +W0426 23:58:21.778661 1910742 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0426 23:58:21.778661 1910742 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-26 23:58:23,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 23:58:23,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-26 23:58:23,071] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-26 23:58:26,337] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 23:58:26,338] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +04/26/2025 23:58:26 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 23:58:26 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=internvl_chat/zero_stage1_config.json, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=10, +evaluation_strategy=steps, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=32, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=False, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=4e-05, +length_column_name=length, +load_best_model_at_end=True, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/runs/Apr26_23-58-26_nlp-in-477-l.soe.ucsc.edu, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_kwargs={}, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=eval_loss, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=1.0, +optim=adamw_torch, +optim_args=None, +output_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=10, +save_strategy=steps, +save_total_limit=2, +seed=42, +skip_memory_metrics=True, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.01, +) +04/26/2025 23:58:26 - INFO - __main__ - Loading Tokenizer: pretrained/InternVL2_5-2B +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:58:26,468 >> loading file ./tokenizer.model +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:58:26,468 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:58:26,468 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:58:26,468 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2025] 2025-04-26 23:58:26,468 >> loading file tokenizer.json +[2025-04-26 23:58:26,502] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-26 23:58:26,518] [INFO] [comm.py:652:init_distributed] cdb=None +[WARNING|logging.py:314] 2025-04-26 23:58:26,576 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +04/26/2025 23:58:26 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:727] 2025-04-26 23:58:26,664 >> loading configuration file pretrained/InternVL2_5-2B/config.json +[INFO|configuration_utils.py:792] 2025-04-26 23:58:26,665 >> Model config InternVLChatConfig { + "_commit_hash": null, + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 2048, + "llm_config": { + "_name_or_path": "internlm/internlm2_5-1_8b-chat", + "add_cross_attention": false, + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "flash_attention_2", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForSequenceClassification": "modeling_internlm2.InternLM2ForSequenceClassification" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bias": false, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "min_length": 0, + "model_type": "internlm2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 2, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": true, + "vocab_size": 92553 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "template": "internvl2_5", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.0, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "no_repeat_ngram_size": 0, + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true + } +} + +04/26/2025 23:58:26 - INFO - __main__ - Using flash_attention_2 for InternLM +[INFO|modeling_utils.py:3473] 2025-04-26 23:58:26,666 >> loading weights file pretrained/InternVL2_5-2B/model.safetensors +[INFO|modeling_utils.py:1426] 2025-04-26 23:58:26,676 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:826] 2025-04-26 23:58:26,677 >> Generate config GenerationConfig {} + +[INFO|configuration_utils.py:826] 2025-04-26 23:58:26,706 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +04/26/2025 23:58:26 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False +04/26/2025 23:58:26 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False +[WARNING|logging.py:314] 2025-04-26 23:58:27,057 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2025-04-26 23:58:27,059 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|modeling_utils.py:4350] 2025-04-26 23:58:29,484 >> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4358] 2025-04-26 23:58:29,484 >> All the weights of InternVLChatModel were initialized from the model checkpoint at pretrained/InternVL2_5-2B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:779] 2025-04-26 23:58:29,486 >> loading configuration file pretrained/InternVL2_5-2B/generation_config.json +[INFO|configuration_utils.py:826] 2025-04-26 23:58:29,486 >> Generate config GenerationConfig { + "eos_token_id": [ + 92542, + 92543 + ] +} + +04/26/2025 23:58:29 - INFO - __main__ - Finished +04/26/2025 23:58:29 - INFO - __main__ - model.config.force_image_size: 448 +04/26/2025 23:58:29 - INFO - __main__ - data_args.force_image_size: 448 +04/26/2025 23:58:29 - INFO - __main__ - model.config.vision_config.image_size: 448 +04/26/2025 23:58:29 - INFO - __main__ - [Dataset] num_image_token: 256 +04/26/2025 23:58:29 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/26/2025 23:58:29 - INFO - __main__ - [Dataset] use_thumbnail: True +04/26/2025 23:58:29 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/26/2025 23:58:29 - INFO - __main__ - Formatting inputs...Skip in lazy mode +[rank0]: Traceback (most recent call last): +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1106, in +[rank0]: main() +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1015, in main +[rank0]: train_dataset, eval_dataset = build_datasets( +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 727, in build_datasets +[rank0]: train_dataset = LazySupervisedDataset( +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank0]: self.rng = np.random.default_rng(seed=random_seed) +[rank0]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank0]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank0]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank0]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank0]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank2]: Traceback (most recent call last): +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1106, in +[rank2]: main() +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1015, in main +[rank2]: train_dataset, eval_dataset = build_datasets( +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 727, in build_datasets +[rank2]: train_dataset = LazySupervisedDataset( +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank2]: self.rng = np.random.default_rng(seed=random_seed) +[rank2]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank2]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank2]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank2]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank2]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank1]: Traceback (most recent call last): +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1106, in +[rank1]: main() +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1015, in main +[rank1]: train_dataset, eval_dataset = build_datasets( +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 727, in build_datasets +[rank1]: train_dataset = LazySupervisedDataset( +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank1]: self.rng = np.random.default_rng(seed=random_seed) +[rank1]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank1]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank1]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank1]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank1]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank0]:[W426 23:58:31.672627967 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +W0426 23:58:31.836450 1910742 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1910823 closing signal SIGTERM +E0426 23:58:31.950660 1910742 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 1910822) of binary: /data/diji/.conda/envs/internvl/bin/python +Traceback (most recent call last): + File "/data/diji/.conda/envs/internvl/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==2.7.0', 'console_scripts', 'torchrun')()) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +internvl_chat/internvl/train/internvl_chat_finetune.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2025-04-26_23:58:31 + host : nlp-in-477-l.soe.ucsc.edu + rank : 2 (local_rank: 2) + exitcode : 1 (pid: 1910824) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-04-26_23:58:31 + host : nlp-in-477-l.soe.ucsc.edu + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 1910822) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +W0427 00:02:14.858589 1913336 site-packages/torch/distributed/run.py:766] +W0427 00:02:14.858589 1913336 site-packages/torch/distributed/run.py:766] ***************************************** +W0427 00:02:14.858589 1913336 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0427 00:02:14.858589 1913336 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-27 00:02:16,120] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-27 00:02:16,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-27 00:02:16,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-27 00:02:19,114] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-27 00:02:19,114] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +04/27/2025 00:02:19 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +04/27/2025 00:02:19 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=internvl_chat/zero_stage1_config.json, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=10, +evaluation_strategy=steps, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=32, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=False, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=4e-05, +length_column_name=length, +load_best_model_at_end=True, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/runs/Apr27_00-02-19_nlp-in-477-l.soe.ucsc.edu, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_kwargs={}, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=eval_loss, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=1.0, +optim=adamw_torch, +optim_args=None, +output_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=10, +save_strategy=steps, +save_total_limit=2, +seed=42, +skip_memory_metrics=True, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.01, +) +04/27/2025 00:02:19 - INFO - __main__ - Loading Tokenizer: pretrained/InternVL2_5-2B +[INFO|tokenization_utils_base.py:2025] 2025-04-27 00:02:19,495 >> loading file ./tokenizer.model +[INFO|tokenization_utils_base.py:2025] 2025-04-27 00:02:19,495 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2025] 2025-04-27 00:02:19,495 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2025] 2025-04-27 00:02:19,495 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2025] 2025-04-27 00:02:19,495 >> loading file tokenizer.json +[2025-04-27 00:02:19,574] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-27 00:02:19,574] [INFO] [comm.py:652:init_distributed] cdb=None +[WARNING|logging.py:314] 2025-04-27 00:02:19,605 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +04/27/2025 00:02:19 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False +04/27/2025 00:02:19 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False +04/27/2025 00:02:19 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:727] 2025-04-27 00:02:19,693 >> loading configuration file pretrained/InternVL2_5-2B/config.json +[INFO|configuration_utils.py:792] 2025-04-27 00:02:19,694 >> Model config InternVLChatConfig { + "_commit_hash": null, + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 2048, + "llm_config": { + "_name_or_path": "internlm/internlm2_5-1_8b-chat", + "add_cross_attention": false, + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "flash_attention_2", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForSequenceClassification": "modeling_internlm2.InternLM2ForSequenceClassification" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bias": false, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "min_length": 0, + "model_type": "internlm2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 2, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": true, + "vocab_size": 92553 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "template": "internvl2_5", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.0, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "no_repeat_ngram_size": 0, + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true + } +} + +04/27/2025 00:02:19 - INFO - __main__ - Using flash_attention_2 for InternLM +[INFO|modeling_utils.py:3473] 2025-04-27 00:02:19,695 >> loading weights file pretrained/InternVL2_5-2B/model.safetensors +[INFO|modeling_utils.py:1426] 2025-04-27 00:02:19,705 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:826] 2025-04-27 00:02:19,706 >> Generate config GenerationConfig {} + +[INFO|configuration_utils.py:826] 2025-04-27 00:02:19,734 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +[WARNING|logging.py:314] 2025-04-27 00:02:19,768 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2025-04-27 00:02:19,780 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|modeling_utils.py:4350] 2025-04-27 00:02:22,502 >> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4358] 2025-04-27 00:02:22,502 >> All the weights of InternVLChatModel were initialized from the model checkpoint at pretrained/InternVL2_5-2B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:779] 2025-04-27 00:02:22,505 >> loading configuration file pretrained/InternVL2_5-2B/generation_config.json +[INFO|configuration_utils.py:826] 2025-04-27 00:02:22,505 >> Generate config GenerationConfig { + "eos_token_id": [ + 92542, + 92543 + ] +} + +04/27/2025 00:02:22 - INFO - __main__ - Finished +04/27/2025 00:02:22 - INFO - __main__ - model.config.force_image_size: 448 +04/27/2025 00:02:22 - INFO - __main__ - data_args.force_image_size: 448 +04/27/2025 00:02:22 - INFO - __main__ - model.config.vision_config.image_size: 448 +04/27/2025 00:02:22 - INFO - __main__ - [Dataset] num_image_token: 256 +04/27/2025 00:02:22 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/27/2025 00:02:22 - INFO - __main__ - [Dataset] use_thumbnail: True +04/27/2025 00:02:22 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/27/2025 00:02:22 - INFO - __main__ - Formatting inputs...Skip in lazy mode +04/27/2025 00:02:22 - INFO - __main__ - Add dataset: bbox with length: 9967 +04/27/2025 00:02:22 - INFO - __main__ - [Dataset] num_image_token: 256 +04/27/2025 00:02:22 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/27/2025 00:02:22 - INFO - __main__ - [Dataset] use_thumbnail: True +04/27/2025 00:02:22 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/27/2025 00:02:22 - INFO - __main__ - Formatting inputs...Skip in lazy mode +[rank0]: Traceback (most recent call last): +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1110, in +[rank0]: main() +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 996, in main +[rank0]: eval_dataset = LazySupervisedDataset( +[rank0]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank0]: self.rng = np.random.default_rng(seed=random_seed) +[rank0]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank0]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank0]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank0]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank0]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank1]: Traceback (most recent call last): +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1110, in +[rank1]: main() +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 996, in main +[rank1]: eval_dataset = LazySupervisedDataset( +[rank1]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank1]: self.rng = np.random.default_rng(seed=random_seed) +[rank1]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank1]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank1]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank1]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank1]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank2]: Traceback (most recent call last): +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1110, in +[rank2]: main() +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 996, in main +[rank2]: eval_dataset = LazySupervisedDataset( +[rank2]: File "/data/diji/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 347, in __init__ +[rank2]: self.rng = np.random.default_rng(seed=random_seed) +[rank2]: File "numpy/random/_generator.pyx", line 4957, in numpy.random._generator.default_rng +[rank2]: File "_pcg64.pyx", line 123, in numpy.random._pcg64.PCG64.__init__ +[rank2]: File "bit_generator.pyx", line 535, in numpy.random.bit_generator.BitGenerator.__init__ +[rank2]: File "bit_generator.pyx", line 307, in numpy.random.bit_generator.SeedSequence.__init__ +[rank2]: TypeError: SeedSequence expects int or sequence of ints for entropy not bbox +[rank0]:[W427 00:02:24.617505143 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +W0427 00:02:24.716484 1913336 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1913419 closing signal SIGTERM +E0427 00:02:24.831111 1913336 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 1913417) of binary: /data/diji/.conda/envs/internvl/bin/python +Traceback (most recent call last): + File "/data/diji/.conda/envs/internvl/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==2.7.0', 'console_scripts', 'torchrun')()) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in main + run(args) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/diji/.conda/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +internvl_chat/internvl/train/internvl_chat_finetune.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2025-04-27_00:02:24 + host : nlp-in-477-l.soe.ucsc.edu + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 1913418) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-04-27_00:02:24 + host : nlp-in-477-l.soe.ucsc.edu + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 1913417) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +W0427 00:05:19.004981 1915580 site-packages/torch/distributed/run.py:766] +W0427 00:05:19.004981 1915580 site-packages/torch/distributed/run.py:766] ***************************************** +W0427 00:05:19.004981 1915580 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0427 00:05:19.004981 1915580 site-packages/torch/distributed/run.py:766] ***************************************** +[2025-04-27 00:05:20,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-27 00:05:20,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-27 00:05:20,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +[2025-04-27 00:05:23,292] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-27 00:05:23,292] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +04/27/2025 00:05:23 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +04/27/2025 00:05:23 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=internvl_chat/zero_stage1_config.json, +disable_tqdm=False, +dispatch_batches=None, +do_eval=False, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=32, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=None, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=4e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full/runs/Apr27_00-05-23_nlp-in-477-l.soe.ucsc.edu, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_kwargs={}, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=4.0, +optim=adamw_torch, +optim_args=None, +output_dir=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['wandb'], +resume_from_checkpoint=None, +run_name=work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=steps, +save_total_limit=3, +seed=42, +skip_memory_metrics=True, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.01, +) +04/27/2025 00:05:23 - INFO - __main__ - Loading Tokenizer: pretrained/InternVL2_5-2B +[INFO|tokenization_utils_base.py:2025] 2025-04-27 00:05:23,424 >> loading file ./tokenizer.model +[INFO|tokenization_utils_base.py:2025] 2025-04-27 00:05:23,424 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2025] 2025-04-27 00:05:23,424 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2025] 2025-04-27 00:05:23,424 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2025] 2025-04-27 00:05:23,424 >> loading file tokenizer.json +[2025-04-27 00:05:23,466] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-27 00:05:23,466] [INFO] [comm.py:652:init_distributed] cdb=None +[WARNING|logging.py:314] 2025-04-27 00:05:23,534 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +04/27/2025 00:05:23 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False +04/27/2025 00:05:23 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False +04/27/2025 00:05:23 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:727] 2025-04-27 00:05:23,622 >> loading configuration file pretrained/InternVL2_5-2B/config.json +[INFO|configuration_utils.py:792] 2025-04-27 00:05:23,623 >> Model config InternVLChatConfig { + "_commit_hash": null, + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 2048, + "llm_config": { + "_name_or_path": "internlm/internlm2_5-1_8b-chat", + "add_cross_attention": false, + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "flash_attention_2", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForSequenceClassification": "modeling_internlm2.InternLM2ForSequenceClassification" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bias": false, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "min_length": 0, + "model_type": "internlm2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 2, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": true, + "vocab_size": 92553 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "template": "internvl2_5", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.0, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "no_repeat_ngram_size": 0, + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true + } +} + +04/27/2025 00:05:23 - INFO - __main__ - Using flash_attention_2 for InternLM +[INFO|modeling_utils.py:3473] 2025-04-27 00:05:23,624 >> loading weights file pretrained/InternVL2_5-2B/model.safetensors +[INFO|modeling_utils.py:1426] 2025-04-27 00:05:23,634 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:826] 2025-04-27 00:05:23,635 >> Generate config GenerationConfig {} + +[INFO|configuration_utils.py:826] 2025-04-27 00:05:23,663 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +[WARNING|logging.py:314] 2025-04-27 00:05:23,694 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2025-04-27 00:05:23,698 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|modeling_utils.py:4350] 2025-04-27 00:05:26,261 >> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4358] 2025-04-27 00:05:26,261 >> All the weights of InternVLChatModel were initialized from the model checkpoint at pretrained/InternVL2_5-2B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:779] 2025-04-27 00:05:26,264 >> loading configuration file pretrained/InternVL2_5-2B/generation_config.json +[INFO|configuration_utils.py:826] 2025-04-27 00:05:26,264 >> Generate config GenerationConfig { + "eos_token_id": [ + 92542, + 92543 + ] +} + +04/27/2025 00:05:26 - INFO - __main__ - Finished +04/27/2025 00:05:26 - INFO - __main__ - model.config.force_image_size: 448 +04/27/2025 00:05:26 - INFO - __main__ - data_args.force_image_size: 448 +04/27/2025 00:05:26 - INFO - __main__ - model.config.vision_config.image_size: 448 +04/27/2025 00:05:26 - INFO - __main__ - [Dataset] num_image_token: 256 +04/27/2025 00:05:26 - INFO - __main__ - [Dataset] dynamic_image_size: True +04/27/2025 00:05:26 - INFO - __main__ - [Dataset] use_thumbnail: True +04/27/2025 00:05:26 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +04/27/2025 00:05:26 - INFO - __main__ - Formatting inputs...Skip in lazy mode +04/27/2025 00:05:26 - INFO - __main__ - Add dataset: bbox with length: 9967 +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.tok_embeddings.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.0.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.0.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.0.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.0.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.0.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.0.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.0.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.1.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.1.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.1.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.1.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.1.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.1.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.1.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.2.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.2.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.2.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.2.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.2.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.2.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.2.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.3.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.3.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.3.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.3.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.3.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.3.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.3.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.4.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.4.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.4.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.4.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.4.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.4.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.4.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.5.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.5.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.5.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.5.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.5.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.5.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.5.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.6.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.6.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.6.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.6.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.6.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.6.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.6.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.7.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.7.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.7.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.7.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.7.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.7.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.7.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.8.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.8.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.8.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.8.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.8.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.8.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.8.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.9.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.9.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.9.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.9.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.9.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.9.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.9.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.10.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.10.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.10.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.10.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.10.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.10.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.10.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.11.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.11.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.11.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.11.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.11.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.11.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.11.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.12.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.12.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.12.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.12.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.12.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.12.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.12.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.13.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.13.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.13.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.13.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.13.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.13.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.13.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.14.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.14.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.14.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.14.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.14.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.14.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.14.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.15.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.15.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.15.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.15.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.15.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.15.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.15.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.16.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.16.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.16.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.16.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.16.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.16.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.16.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.17.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.17.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.17.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.17.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.17.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.17.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.17.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.18.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.18.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.18.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.18.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.18.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.18.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.18.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.19.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.19.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.19.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.19.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.19.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.19.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.19.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.20.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.20.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.20.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.20.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.20.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.20.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.20.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.21.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.21.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.21.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.21.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.21.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.21.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.21.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.22.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.22.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.22.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.22.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.22.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.22.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.22.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.23.attention.wqkv.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.23.attention.wo.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.23.feed_forward.w1.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.23.feed_forward.w3.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.23.feed_forward.w2.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.23.attention_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.layers.23.ffn_norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.model.norm.weight +04/27/2025 00:05:26 - INFO - __main__ - language_model.output.weight +04/27/2025 00:05:26 - INFO - __main__ - mlp1.0.weight +04/27/2025 00:05:26 - INFO - __main__ - mlp1.0.bias +04/27/2025 00:05:26 - INFO - __main__ - mlp1.1.weight +04/27/2025 00:05:26 - INFO - __main__ - mlp1.1.bias +04/27/2025 00:05:26 - INFO - __main__ - mlp1.3.weight +04/27/2025 00:05:26 - INFO - __main__ - mlp1.3.bias +[INFO|trainer.py:571] 2025-04-27 00:05:26,501 >> Using auto half precision backend +[2025-04-27 00:05:26,685] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.15.4, git-hash=unknown, git-branch=unknown +[2025-04-27 00:05:26,685] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 3 +[2025-04-27 00:05:27,797] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +Using /data/diji/.cache/torch_extensions/py39_cu126 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file /data/diji/.cache/torch_extensions/py39_cu126/fused_adam/build.ninja... +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.35503411293029785 seconds +Loading extension module fused_adam... +Time to load fused_adam op: 0.40209174156188965 seconds +Loading extension module fused_adam... +Time to load fused_adam op: 0.402071475982666 seconds +[2025-04-27 00:05:28,655] [INFO] [logging.py:128:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer +[2025-04-27 00:05:28,655] [INFO] [logging.py:128:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2025-04-27 00:05:28,661] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +[2025-04-27 00:05:28,661] [INFO] [utils.py:59:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[2025-04-27 00:05:28,661] [INFO] [logging.py:128:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 1 optimizer +[2025-04-27 00:05:28,661] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 1000000000 +[2025-04-27 00:05:28,661] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 1000000000 +[2025-04-27 00:05:28,661] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False +[2025-04-27 00:05:28,661] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False +[2025-04-27 00:05:32,491] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states +[2025-04-27 00:05:32,491] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 8.03 GB CA 8.4 GB Max_CA 8 GB +[2025-04-27 00:05:32,491] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 70.62 GB, percent = 14.0% +[2025-04-27 00:05:32,646] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states +[2025-04-27 00:05:32,647] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 9.21 GB CA 10.76 GB Max_CA 11 GB +[2025-04-27 00:05:32,647] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 70.62 GB, percent = 14.0% +[2025-04-27 00:05:32,647] [INFO] [stage_1_and_2.py:544:__init__] optimizer state initialized +[2025-04-27 00:05:32,802] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer +[2025-04-27 00:05:32,802] [INFO] [utils.py:782:see_memory_usage] MA 6.85 GB Max_MA 6.85 GB CA 10.76 GB Max_CA 11 GB +[2025-04-27 00:05:32,803] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 70.62 GB, percent = 14.0% +[2025-04-27 00:05:32,803] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer +[2025-04-27 00:05:32,804] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler +[2025-04-27 00:05:32,804] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2025-04-27 00:05:32,804] [INFO] [logging.py:128:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[[0.9, 0.999]] +[2025-04-27 00:05:32,805] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2025-04-27 00:05:32,805] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-04-27 00:05:32,805] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-04-27 00:05:32,805] [INFO] [config.py:1003:print] amp_enabled .................. False +[2025-04-27 00:05:32,805] [INFO] [config.py:1003:print] amp_params ................... False +[2025-04-27 00:05:32,805] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] bfloat16_enabled ............. True +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] bfloat16_immediate_grad_update False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] comms_config ................. +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] communication_data_type ...... None +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] disable_allgather ............ False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] dump_state ................... False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... None +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] fp16_auto_cast ............... None +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] fp16_enabled ................. False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] global_rank .................. 0 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 32 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 1 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] loss_scale ................... 1.0 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] mics_shard_size .............. -1 +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-04-27 00:05:32,806] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] optimizer_name ............... adamw +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] optimizer_params ............. {'lr': 4e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.01} +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] pld_enabled .................. False +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] pld_params ................... False +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] prescale_gradients ........... False +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] scheduler_name ............... None +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] scheduler_params ............. None +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] seq_parallel_communication_data_type torch.float32 +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] sparse_attention ............. None +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] steps_per_print .............. inf +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] train_batch_size ............. 96 +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 1 +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] wall_clock_breakdown ......... True +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] weight_quantization_config ... None +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] world_size ................... 3 +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=1000000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=1000000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] zero_enabled ................. True +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2025-04-27 00:05:32,807] [INFO] [config.py:1003:print] zero_optimization_stage ...... 1 +[2025-04-27 00:05:32,807] [INFO] [config.py:989:print_user_config] json = { + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1.000000e+09, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1.000000e+09, + "contiguous_gradients": true + }, + "fp16": { + "enabled": false, + "auto_cast": true, + "loss_scale": 0, + "initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 4e-05, + "betas": [0.9, 0.999], + "eps": 1e-08, + "weight_decay": 0.01 + } + }, + "gradient_accumulation_steps": 32, + "gradient_clipping": 1.0, + "steps_per_print": inf, + "train_batch_size": 96, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": true +} +[INFO|trainer.py:1721] 2025-04-27 00:05:32,807 >> ***** Running training ***** +[INFO|trainer.py:1722] 2025-04-27 00:05:32,807 >> Num examples = 9,967 +[INFO|trainer.py:1723] 2025-04-27 00:05:32,807 >> Num Epochs = 4 +[INFO|trainer.py:1724] 2025-04-27 00:05:32,807 >> Instantaneous batch size per device = 1 +[INFO|trainer.py:1727] 2025-04-27 00:05:32,807 >> Total train batch size (w. parallel, distributed & accumulation) = 96 +[INFO|trainer.py:1728] 2025-04-27 00:05:32,807 >> Gradient Accumulation steps = 32 +[INFO|trainer.py:1729] 2025-04-27 00:05:32,807 >> Total optimization steps = 412 +[INFO|trainer.py:1730] 2025-04-27 00:05:32,808 >> Number of trainable parameters = 1,901,742,080 +[INFO|integration_utils.py:722] 2025-04-27 00:05:32,809 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" +wandb: Currently logged in as: dyang39 to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.19.10 +wandb: Run data is saved locally in /data/diji/InternVL/wandb/run-20250427_000533-tupeu7gc +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run driven-elevator-134 +wandb: ⭐️ View project at https://wandb.ai/dyang39/huggingface +wandb: 🚀 View run at https://wandb.ai/dyang39/huggingface/runs/tupeu7gc + 0%| | 0/412 [00:00